- c - 在位数组中找到第一个零
- linux - Unix 显示有关匹配两种模式之一的文件的信息
- 正则表达式替换多个文件
- linux - 隐藏来自 xtrace 的命令
这是 godbolt 生成的代码.
下面是 Visual Studio 在我的 main.asm 文件上生成的相同代码(通过 Project->C/C++->Output Files->Assembly With Source Code (/FAs) 在 Assembler Output 字段下启用):
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1
TITLE c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
EXTRN __imp____std_terminate:PROC
EXTRN @__security_check_cookie@4:PROC
EXTRN __imp____CxxFrameHandler3:PROC
PUBLIC ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z ; std::less<void>::operator()<double const &,double const &>
PUBLIC ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC ??$clamp@N@std@@YAABNABN00@Z ; std::clamp<double>
PUBLIC _main
PUBLIC ?ProcessOptimized@MyPlugin@@QAEXH@Z ; MyPlugin::ProcessOptimized
PUBLIC ?Process@MyPlugin@@QAEXH@Z ; MyPlugin::Process
PUBLIC ??1MyPlugin@@QAE@XZ ; MyPlugin::~MyPlugin
PUBLIC ??0MyPlugin@@QAE@XZ ; MyPlugin::MyPlugin
PUBLIC ?ProcessOptimized@Param@@QAEXHH@Z ; Param::ProcessOptimized
PUBLIC ?Process@Param@@QAEXHH@Z ; Param::Process
PUBLIC ??0Param@@QAE@XZ ; Param::Param
PUBLIC __real@3ff0000000000000
PUBLIC __real@400921fb54442d18
PUBLIC __real@4024000000000000
PUBLIC __real@406fe00000000000
PUBLIC __xmm@00000003000000020000000100000000
PUBLIC __xmm@400921fb54442d18400921fb54442d18
PUBLIC __xmm@406fe00000000000406fe00000000000
EXTRN __chkstk:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; Read-only floating-point / integer constants referenced by the functions
; below. Each one lives in its own COMDAT CONST segment. The DB strings are
; the little-endian bytes of the value, with printable bytes shown as ASCII
; characters (e.g. 'o' = 6fH, '@' = 40H).
; COMDAT __xmm@406fe00000000000406fe00000000000
CONST SEGMENT
; 16 bytes: two packed doubles, each 406fe00000000000H = 255.0
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
DB '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST ENDS
; COMDAT __xmm@400921fb54442d18400921fb54442d18
CONST SEGMENT
; 16 bytes: two packed doubles, each 400921fb54442d18H (the PI upper bound)
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
DB 018H, '-DT', 0fbH, '!', 09H, '@'
CONST ENDS
; COMDAT __xmm@00000003000000020000000100000000
CONST SEGMENT
; 16 bytes: four packed 32-bit integers 0, 1, 2, 3 (lowest lane first)
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
DB 00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST ENDS
; COMDAT __real@406fe00000000000
CONST SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r ; 255
CONST ENDS
; COMDAT __real@4024000000000000
CONST SEGMENT
__real@4024000000000000 DQ 04024000000000000r ; 10
CONST ENDS
; COMDAT __real@400921fb54442d18
CONST SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r ; 3.14159
CONST ENDS
; COMDAT __real@3ff0000000000000
CONST SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r ; 1
CONST ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0Param@@QAE@XZ
_TEXT SEGMENT
; Param::Param() -- default constructor.
; In:  ecx = this
; Out: eax = this
; Stores the default member values. The field names used in the comments
; below come from the source annotations in MyPlugin::MyPlugin, which
; contains the same store sequence inlined.
??0Param@@QAE@XZ PROC ; Param::Param, COMDAT
; _this$ = ecx
; 23 : Param() { }
xorps xmm0, xmm0
mov eax, ecx
; [this+0] (mPhase) = 0.0, [this+16] (mPhaseOptimized) = 0.0
movsd QWORD PTR [ecx], xmm0
movsd QWORD PTR [ecx+16], xmm0
; [this+32] (mNoteFrequency) = 10.0
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [ecx+32], xmm0
; [this+48] (mHostPitch) = 1.0, [this+64] (mRadiansPerSample) = 1.0
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [ecx+48], xmm0
movsd QWORD PTR [ecx+64], xmm0
ret 0
??0Param@@QAE@XZ ENDP ; Param::Param
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::Process(int voiceIndex, int blockSize) -- scalar phase accumulation.
; In:  ecx = this, [ebp+8] = voiceIndex, [ebp+12] = blockSize (marked $dead$:
;      never read; the loop count is the constant 256).
; Out: nothing; writes the accumulated phase back to [this+0] (mPhase).
; The callee pops its 8 bytes of stack arguments (ret 8).
;
; Register roles inside the loop:
;   esi  = this
;   edx  = this + 2128 + voiceIndex*2048 -> current pC element;
;          [edx-2048] is the matching pB element
;   ecx  = remaining iterations (256 down to 0)
;   xmm1 = phase accumulator
;   xmm2 = PI (upper bound), xmm5 = 0.0 (lower bound)
;   xmm3 = bp0 = [this+48] * [this+32]
;   xmm4 = [this+64] (mRadiansPerSample)
;
; Stack temporaries: std::clamp works on references, so the bounds and the
; computed value are spilled and an address is selected with cmov.
;   $T1 = PI, $T2 = 0.0, $T3 = value being clamped
$T1 = -24 ; size = 8
$T3 = -16 ; size = 8
$T2 = -8 ; size = 8
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?Process@Param@@QAEXHH@Z PROC ; Param::Process, COMDAT
; _this$ = ecx
; 25 : inline void Process(int voiceIndex, int blockSize) {
push ebp
mov ebp, esp
sub esp, 24 ; 00000018H
; 26 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebp]
xorps xmm5, xmm5
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm2, QWORD PTR __real@400921fb54442d18
push esi
mov esi, ecx
; eax = voiceIndex * 2048 (byte offset of one 256-double row)
shl eax, 11 ; 0000000bH
push edi
movsd QWORD PTR $T1[ebp], xmm2
mov ecx, 256 ; 00000100H
movsd QWORD PTR $T2[ebp], xmm5
movsd xmm3, QWORD PTR [esi+48]
lea edx, DWORD PTR [esi+2128]
movsd xmm1, QWORD PTR [esi]
add edx, eax
mulsd xmm3, QWORD PTR [esi+32]
movsd xmm4, QWORD PTR [esi+64]
npad 11
$LL4@Process:
; xmm0 = mRadiansPerSample * (bp0 * pB[i] + pC[i])
movsd xmm0, QWORD PTR [edx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [edx]
mulsd xmm0, xmm4
comisd xmm0, xmm2
movsd QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; value <= PI (or unordered): go check the lower bound; otherwise use PI
jbe SHORT $LN10@Process
movaps xmm0, xmm2
jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; eax = &0.0 by default; switched to &value when 0.0 <= value (or unordered)
lea eax, DWORD PTR $T2[ebp]
lea edi, DWORD PTR $T3[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
; advance to the next double in both rows
add edx, 8
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
addsd xmm1, xmm0
sub ecx, 1
jne SHORT $LL4@Process
; 35 : }
; 36 :
; 37 : mPhase = phase;
; 38 : }
pop edi
movsd QWORD PTR [esi], xmm1
pop esi
mov esp, ebp
pop ebp
ret 8
?Process@Param@@QAEXHH@Z ENDP ; Param::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 version that
; handles two samples per iteration using the intrinsics shown in the
; interleaved source lines.
; In:  ecx = this, [ebx+8] = voiceIndex (ebx = esp after the first push),
;      blockSize is marked $dead$: never read; the loop count is 128.
; Out: nothing; writes the upper lane of v_phase to [this+16]
;      (mPhaseOptimized).
; The callee pops its 8 bytes of stack arguments (ret 8).
;
; Frame: ebx keeps the incoming stack pointer for argument access, while
; esp/ebp are realigned to 16 bytes so that _v_phase$ ([ebp-16]) can be
; written with movaps.
;
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 + voiceIndex*2048 -> pC + 1; [eax-2048] is pB + 1
;   ecx  = remaining iterations (128 down to 0)
;   xmm0 = v_phase
;   xmm4 = v_pB0, xmm1 = v_pC0 (then v_phaseAcc1)
;   xmm3 = v_pB1, xmm2 = v_pC1 (then v_phaseAcc2)
;   xmm5 = v_radiansPerSampleBp0 (both lanes), xmm6 = v_radiansPerSample
;   xmm7 = 0.0 lower bound; the PI upper bound is read from memory
_v_phase$ = -16 ; size = 16
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC ; Param::ProcessOptimized, COMDAT
; _this$ = ecx
; 39 : inline void ProcessOptimized(int voiceIndex, int blockSize) {
; Stack-realigning prologue: after "push ebp" esp is 16-byte aligned, and a
; copy of the return address ([ebx+4]) is placed just above the saved ebp.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
; 40 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebx]
mov edx, ecx
; eax = voiceIndex * 2048 (byte offset of one 256-double row)
shl eax, 11 ; 0000000bH
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H
xorps xmm7, xmm7
mov ecx, 128 ; 00000080H
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm5, QWORD PTR [edx+48]
mulsd xmm5, QWORD PTR [edx+32]
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
movsd xmm6, QWORD PTR [edx+64]
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
; Note: the loads below are emitted as movups even where the source uses
; _mm_load_pd.
movsd xmm0, QWORD PTR [eax+edx+80]
movups xmm4, XMMWORD PTR [eax+edx+80]
movups xmm1, XMMWORD PTR [eax+edx+2128]
mulsd xmm5, xmm6
; xmm3 = { 0.0, pB[0] }
unpcklpd xmm3, xmm0
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [eax+edx+2128]
add eax, 2136 ; 00000858H
; xmm2 = { 0.0, pC[0] }
unpcklpd xmm2, xmm0
add eax, edx
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
movsd xmm0, QWORD PTR [edx+16]
; broadcast the scalar factors into both lanes
unpcklpd xmm5, xmm5
unpcklpd xmm6, xmm6
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
mulpd xmm2, xmm6
unpcklpd xmm0, xmm0
npad 2
$LL4@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; The loads for the next iteration are interleaved with this iteration's
; clamp and accumulate instructions.
movups xmm4, XMMWORD PTR [eax-2040]
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
maxpd xmm1, xmm7
maxpd xmm2, xmm7
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm0, xmm1
movups xmm1, XMMWORD PTR [eax+8]
addpd xmm0, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
add eax, 16 ; 00000010H
; v_phase is written to its stack slot on every iteration; presumably this
; is because the source indexes v_phase.m128d_f64 after the loop.
movaps XMMWORD PTR _v_phase$[ebp], xmm0
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm6
sub ecx, 1
jne SHORT $LL4@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; The index is fixed at 1 here: the upper lane is read back from the stack.
movsd xmm0, QWORD PTR _v_phase$[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 88 : }
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP ; Param::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::MyPlugin() -- constructor.
; In:  ecx = this
; Out: eax = this
; Does three things:
;   1. stores the default member values at [this+0] .. [this+64];
;   2. fills 256 doubles starting at [this+80] with sampleIndex / 255.0,
;      four samples per loop iteration;
;   3. zero-fills 2048 bytes starting at [this+2128] with rep stosd.
; NOTE(review): only one 2048-byte row of b and of c is written, so
; voiceSize is presumably 1; its definition is not visible in this listing.
??0MyPlugin@@QAE@XZ PROC ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; xmm2 = integer lanes {0,1,2,3}; xmm3 = {255.0, 255.0}; edx = sampleIndex
movaps xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
xorps xmm0, xmm0
movaps xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
xor edx, edx
push esi
mov esi, ecx
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR [esi], xmm0
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; ecx = store cursor; it is advanced by 32 before the stores in each
; iteration, so the first store ([ecx-40]) lands at this+80.
lea ecx, DWORD PTR [esi+88]
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR [esi+16], xmm0
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [esi+32], xmm0
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [esi+48], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
; First pair: broadcast sampleIndex, add {0,1} (low 64 bits of xmm2),
; convert the two low integers to double and divide by 255.0.
movd xmm0, edx
; eax = sampleIndex + 2, the base of the second pair
lea eax, DWORD PTR [edx+2]
pshufd xmm1, xmm0, 0
lea ecx, DWORD PTR [ecx+32]
movq xmm0, xmm2
add edx, 4
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movlpd QWORD PTR [ecx-40], xmm0
movhpd QWORD PTR [ecx-32], xmm0
; Second pair: same sequence starting from sampleIndex + 2.
movd xmm0, eax
pshufd xmm1, xmm0, 0
movq xmm0, xmm2
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
movlpd QWORD PTR [ecx-24], xmm0
movhpd QWORD PTR [ecx-16], xmm0
cmp edx, 256 ; 00000100H
jl SHORT $LL7@MyPlugin
; 103 : }
; 104 : }
; 105 :
; 106 : // fill c
; 107 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; The fill loop is emitted as a memset: 512 dwords of zero = 256 doubles.
lea edi, DWORD PTR [esi+2128]
xor eax, eax
mov ecx, 512 ; 00000200H
rep stosd
; 109 : double value = 0.0;
; 110 :
; 111 : mParam1.c[voiceIndex][sampleIndex] = value;
; 112 : }
; 113 : }
; 114 : }
pop edi
mov eax, esi
pop esi
ret 0
??0MyPlugin@@QAE@XZ ENDP ; MyPlugin::MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??1MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor; the body is a bare return and
; the this pointer in ecx is never read (marked $dead$).
??1MyPlugin@@QAE@XZ PROC ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx
; 115 : ~MyPlugin() { }
ret 0
??1MyPlugin@@QAE@XZ ENDP ; MyPlugin::~MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::Process(int blockSize) -- the voice loop with Param::Process
; inlined into it (see the interleaved source line numbers 26-37). Only a
; single pass over one row is emitted, with no voiceIndex offset applied.
; In:  ecx = this, [ebp+8] = blockSize (marked $dead$: never read; the loop
;      count is the constant 256).
; Out: nothing; writes the accumulated phase to [this+0] (mPhase).
; The callee pops its 4 bytes of stack arguments (ret 4).
;
; Register roles inside the loop:
;   esi  = this
;   ecx  = this + 2128 -> current pC element; [ecx-2048] is the pB element
;   edx  = remaining iterations (256 down to 0)
;   xmm1 = phase accumulator
;   xmm2 = PI (upper bound), xmm5 = 0.0 (lower bound)
;   xmm3 = bp0 = [this+48] * [this+32]
;   xmm4 = [this+64] (mRadiansPerSample)
;
; Stack temporaries used by the reference-based std::clamp:
;   $T2 = PI, $T3 = 0.0, $T4 = value being clamped
$T2 = -28 ; size = 8
$T4 = -20 ; size = 8
$T3 = -12 ; size = 8
_blockSize$dead$ = 8 ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC ; MyPlugin::Process, COMDAT
; _this$ = ecx
; 117 : void Process(int blockSize) {
push ebp
mov ebp, esp
sub esp, 28 ; 0000001cH
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm2, QWORD PTR __real@400921fb54442d18
xorps xmm5, xmm5
; 117 : void Process(int blockSize) {
push esi
mov esi, ecx
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[ebp], xmm2
; 117 : void Process(int blockSize) {
push edi
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[ebp], xmm5
mov edx, 256 ; 00000100H
movsd xmm3, QWORD PTR [esi+48]
; 27 : double *pC = c[voiceIndex];
lea ecx, DWORD PTR [esi+2128]
; 28 : double phase = mPhase;
; 29 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm1, QWORD PTR [esi]
mulsd xmm3, QWORD PTR [esi+32]
movsd xmm4, QWORD PTR [esi+64]
npad 3
$LL9@Process:
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; xmm0 = mRadiansPerSample * (bp0 * pB[i] + pC[i])
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, xmm4
comisd xmm0, xmm2
movsd QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; value <= PI (or unordered): go check the lower bound; otherwise use PI
jbe SHORT $LN15@Process
movaps xmm0, xmm2
jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; eax = &0.0 by default; switched to &value when 0.0 <= value (or unordered)
lea eax, DWORD PTR $T3[ebp]
lea edi, DWORD PTR $T4[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
; advance to the next double in both rows
add ecx, 8
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
addsd xmm1, xmm0
sub edx, 1
jne SHORT $LL9@Process
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop edi
; 37 : mPhase = phase;
movsd QWORD PTR [esi], xmm1
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop esi
mov esp, ebp
pop ebp
ret 4
?Process@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::ProcessOptimized(int blockSize) -- the voice loop with
; Param::ProcessOptimized inlined into it (see the interleaved source line
; numbers 40-87). Only a single pass over one row is emitted, with no
; voiceIndex offset applied.
; In:  ecx = this, blockSize is marked $dead$: never read; the loop count
;      is the constant 128 (two samples per iteration).
; Out: nothing; writes the upper lane of v_phase to [this+16]
;      (mPhaseOptimized).
; The callee pops its 4 bytes of stack arguments (ret 4).
;
; Frame: ebx keeps the incoming stack pointer, while esp/ebp are realigned
; to 16 bytes so that _v_phase$31 ([ebp-16]) can be written with movaps.
;
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 -> pC + 1; [eax-2048] is pB + 1
;   ecx  = remaining iterations (128 down to 0)
;   xmm5 = v_phase
;   xmm4 = v_pB0, xmm1 = v_pC0 (then v_phaseAcc1)
;   xmm3 = v_pB1, xmm2 = v_pC1 (then v_phaseAcc2)
;   xmm6 = v_radiansPerSampleBp0 (both lanes), xmm7 = v_radiansPerSample
;   xmm0 = 0.0 lower bound; the PI upper bound is read from memory
_v_phase$31 = -16 ; size = 16
_blockSize$dead$ = 8 ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx
; 122 : void ProcessOptimized(int blockSize) {
; Stack-realigning prologue: after "push ebp" esp is 16-byte aligned, and a
; copy of the return address ([ebx+4]) is placed just above the saved ebp.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
mov edx, ecx
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H
; 40 : double *pB = b[voiceIndex];
mov ecx, 128 ; 00000080H
movsd xmm6, QWORD PTR [edx+48]
lea eax, DWORD PTR [edx+2136]
mulsd xmm6, QWORD PTR [edx+32]
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
movsd xmm7, QWORD PTR [edx+64]
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR [edx+80]
movsd xmm5, QWORD PTR [edx+16]
movups xmm4, XMMWORD PTR [edx+80]
movups xmm1, XMMWORD PTR [edx+2128]
mulsd xmm6, xmm7
; xmm3 = { 0.0, pB[0] }
unpcklpd xmm3, xmm0
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [edx+2128]
; broadcast the scalar factors into both lanes
unpcklpd xmm7, xmm7
unpcklpd xmm6, xmm6
; xmm2 = { 0.0, pC[0] }
unpcklpd xmm2, xmm0
; xmm0 is now free and becomes the 0.0 lower bound
xorps xmm0, xmm0
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
mulpd xmm4, xmm6
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
mulpd xmm1, xmm7
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
mulpd xmm3, xmm6
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
unpcklpd xmm5, xmm5
npad 13
$LL9@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; The loads for the next iteration are interleaved with this iteration's
; clamp and accumulate instructions.
movups xmm4, XMMWORD PTR [eax-2040]
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
maxpd xmm1, xmm0
maxpd xmm2, xmm0
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm5, xmm1
movups xmm1, XMMWORD PTR [eax+8]
addpd xmm5, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
add eax, 16 ; 00000010H
; v_phase is written to its stack slot on every iteration; presumably this
; is because the source indexes v_phase.m128d_f64 after the loop.
movaps XMMWORD PTR _v_phase$31[ebp], xmm5
mulpd xmm4, xmm6
mulpd xmm1, xmm7
mulpd xmm3, xmm6
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
sub ecx, 1
jne SHORT $LL9@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; The index is fixed at 1 here: the upper lane is read back from the stack.
movsd xmm0, QWORD PTR _v_phase$31[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 123 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124 : mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125 : }
; 126 : }
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT _main
_TEXT SEGMENT
_counterProcessing$1$ = -4304 ; size = 4
_counterProcessing$ = -4304 ; size = 8
_bp0$1$ = -4296 ; size = 8
_v_radiansPerSample$1$ = -4288 ; size = 16
$T3 = -4264 ; size = 8
_v_phase$38 = -4256 ; size = 16
$T4 = -4256 ; size = 8
$T2 = -4232 ; size = 8
tv1040 = -4224 ; size = 16
tv1039 = -4208 ; size = 16
_myPlugin$ = -4192 ; size = 4176
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; 129 : int main() {
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
mov eax, 4312 ; 000010d8H
call __chkstk
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+4312], eax
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR _myPlugin$[esp+4392]
movsd xmm1, QWORD PTR __real@406fe00000000000
xorps xmm2, xmm2
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd QWORD PTR _myPlugin$[esp+4344], xmm0
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
; 129 : int main() {
push esi
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR _myPlugin$[esp+4320], xmm2
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR _myPlugin$[esp+4336], xmm2
; 17 : alignas(16) double mHostPitch = 1.0;
movsd QWORD PTR _myPlugin$[esp+4368], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
movd xmm0, eax
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR [ecx+8]
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
cvtdq2pd xmm0, xmm0
inc eax
divsd xmm0, xmm1
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movsd QWORD PTR [ecx-8], xmm0
cmp eax, 256 ; 00000100H
jl SHORT $LL11@main
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm6, QWORD PTR __real@400921fb54442d18
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea edi, DWORD PTR _myPlugin$[esp+6448]
mov ecx, 512 ; 00000200H
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[esp+4320], xmm6
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[esp+4320], xmm2
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
rep stosd
movsd xmm3, QWORD PTR _myPlugin$[esp+4352]
xorps xmm0, xmm0
mulsd xmm3, QWORD PTR _myPlugin$[esp+4368]
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movaps xmm4, xmm2
movsd xmm1, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm5, QWORD PTR _myPlugin$[esp+4336]
; 130 : MyPlugin myPlugin;
; 131 :
; 132 : long long numProcessing = 5;
; 133 : long long counterProcessing = 0;
movlpd QWORD PTR _counterProcessing$[esp+4320], xmm0
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR _myPlugin$[esp+4400]
movaps xmm7, xmm3
mulsd xmm7, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
mov edi, DWORD PTR _counterProcessing$[esp+4324]
mov esi, DWORD PTR _counterProcessing$[esp+4320]
unpcklpd xmm4, xmm0
movsd xmm0, QWORD PTR _myPlugin$[esp+6448]
movups XMMWORD PTR tv1040[esp+4320], xmm4
movaps xmm4, xmm2
unpcklpd xmm1, xmm1
unpcklpd xmm4, xmm0
movups XMMWORD PTR tv1039[esp+4320], xmm4
movsd xmm4, QWORD PTR _myPlugin$[esp+4320]
movsd QWORD PTR _bp0$1$[esp+4320], xmm3
unpcklpd xmm7, xmm7
movaps XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
npad 8
$LL2@main:
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
add esi, 1
; 26 : double *pB = b[voiceIndex];
lea ecx, DWORD PTR _myPlugin$[esp+6448]
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
mov DWORD PTR _counterProcessing$1$[esp+4320], esi
; 26 : double *pB = b[voiceIndex];
mov edx, 256 ; 00000100H
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
adc edi, 0
npad 10
$LL29@main:
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, QWORD PTR _myPlugin$[esp+4384]
comisd xmm0, xmm6
movsd QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN35@main
movaps xmm0, xmm6
jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
lea eax, DWORD PTR $T3[esp+4320]
lea esi, DWORD PTR $T4[esp+4320]
cmovbe eax, esi
movsd xmm0, QWORD PTR [eax]
// ...
(注意:由于 StackOverflow 对帖子长度有限制,我删除了一部分行。)
这很不一样。另外,我看到 VS 生成的代码有些冗余:搜索字符串 phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); 就会发现它出现了很多次。
我缺少哪些设置?我在 x86 构建上使用了相同的 MSVC 版本 (19.15),并且设置了与我实际使用的相同的优化选项。
最佳答案
您似乎没有使用相同的编译器标志。来自 Visual Studio 的汇编转储显示每个函数都使用标志 /Ogtp 进行了优化,当您在命令行中指定 /Og 时,这些标志会在内部使用。另一方面,在 Godbolt 版本中,您使用了 /Ot /O2,它在内部对应于 /Ogtpy。如果我手动添加 /Oy 标志,代码会略有不同,但仍然与 Visual Studio 生成的代码不同。
我意识到编译器版本并不完全相同,但 19.15.26726.0 和 19.15.26732.1 之间的差异非常小,可能仅包括错误修复。我认为还有其他不同的标志。您可以转到项目的属性页,并在“所有选项”和“其他选项”窗格中找到已使用的所有编译器选项。在 Release 版本中,除了 /arch:SSE2 /Ot /O2 之外,还使用了许多其他选项。注意 /arch:SSE2 是默认值,因此您不必明确指定它。另外,/O2 隐含 /Ot,所以 /arch:SSE2 /Ot /O2 等同于 /O2。
关于c++ - 为什么 godbolt 生成的 asm 输出与我在 Visual Studio 中的实际 asm 代码不同?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54035228/
我想做的是让 JTextPane 在 JPanel 中占用尽可能多的空间。对于我使用的 UpdateInfoPanel: public class UpdateInfoPanel extends JP
我在 JPanel 中有一个 JTextArea,我想将其与 JScrollPane 一起使用。我正在使用 GridBagLayout。当我运行它时,框架似乎为 JScrollPane 腾出了空间,但
我想在 xcode 中实现以下功能。 我有一个 View Controller 。在这个 UIViewController 中,我有一个 UITabBar。它们下面是一个 UIView。将 UITab
有谁知道Firebird 2.5有没有类似于SQL中“STUFF”函数的功能? 我有一个包含父用户记录的表,另一个表包含与父相关的子用户记录。我希望能够提取用户拥有的“ROLES”的逗号分隔字符串,而
我想使用 JSON 作为 mirth channel 的输入和输出,例如详细信息保存在数据库中或创建 HL7 消息。 简而言之,输入为 JSON 解析它并输出为任何格式。 最佳答案 var objec
通常我会使用 R 并执行 merge.by,但这个文件似乎太大了,部门中的任何一台计算机都无法处理它! (任何从事遗传学工作的人的附加信息)本质上,插补似乎删除了 snp ID 的 rs 数字,我只剩
我有一个以前可能被问过的问题,但我很难找到正确的描述。我希望有人能帮助我。 在下面的代码中,我设置了varprice,我想添加javascript变量accu_id以通过rails在我的数据库中查找记
我有一个简单的 SVG 文件,在 Firefox 中可以正常查看 - 它的一些包装文本使用 foreignObject 包含一些 HTML - 文本包装在 div 中:
所以我正在为学校编写一个 Ruby 程序,如果某个值是 1 或 3,则将 bool 值更改为 true,如果是 0 或 2,则更改为 false。由于我有 Java 背景,所以我认为这段代码应该有效:
我做了什么: 我在这些账户之间创建了 VPC 对等连接 互联网网关也连接到每个 VPC 还配置了路由表(以允许来自双方的流量) 情况1: 当这两个 VPC 在同一个账户中时,我成功测试了从另一个 La
我有一个名为 contacts 的表: user_id contact_id 10294 10295 10294 10293 10293 10294 102
我正在使用 Magento 中的新模板。为避免重复代码,我想为每个产品预览使用相同的子模板。 特别是我做了这样一个展示: $products = Mage::getModel('catalog/pro
“for”是否总是检查协议(protocol)中定义的每个函数中第一个参数的类型? 编辑(改写): 当协议(protocol)方法只有一个参数时,根据该单个参数的类型(直接或任意)找到实现。当协议(p
我想从我的 PHP 代码中调用 JavaScript 函数。我通过使用以下方法实现了这一点: echo ' drawChart($id); '; 这工作正常,但我想从我的 PHP 代码中获取数据,我使
这个问题已经有答案了: Event binding on dynamically created elements? (23 个回答) 已关闭 5 年前。 我有一个动态表单,我想在其中附加一些其他 h
我正在尝试找到一种解决方案,以在 componentDidMount 中的映射项上使用 setState。 我正在使用 GraphQL连同 Gatsby返回许多 data 项目,但要求在特定的 pat
我在 ScrollView 中有一个 View 。只要用户按住该 View ,我想每 80 毫秒调用一次方法。这是我已经实现的: final Runnable vibrate = new Runnab
我用 jni 开发了一个 android 应用程序。我在 GetStringUTFChars 的 dvmDecodeIndirectRef 中得到了一个 dvmabort。我只中止了一次。 为什么会这
当我到达我的 Activity 时,我调用 FragmentPagerAdapter 来处理我的不同选项卡。在我的一个选项卡中,我想显示一个 RecyclerView,但他从未出现过,有了断点,我看到
当我按下 Activity 中的按钮时,会弹出一个 DialogFragment。在对话框 fragment 中,有一个看起来像普通 ListView 的 RecyclerView。 我想要的行为是当
我是一名优秀的程序员,十分优秀!