- c - 在位数组中找到第一个零
- linux - Unix 显示有关匹配两种模式之一的文件的信息
- 正则表达式替换多个文件
- linux - 隐藏来自 xtrace 的命令
这是 godbolt 生成的代码。
下面是 Visual Studio 针对同一份代码生成的 main.asm 文件内容(通过 Project -> C/C++ -> Output Files,在 Assembler Output 字段中选择 Assembly With Source Code (/FAs) 来启用):
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1
TITLE c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
EXTRN __imp____std_terminate:PROC
EXTRN @__security_check_cookie@4:PROC
EXTRN __imp____CxxFrameHandler3:PROC
PUBLIC ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z ; std::less<void>::operator()<double const &,double const &>
PUBLIC ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC ??$clamp@N@std@@YAABNABN00@Z ; std::clamp<double>
PUBLIC _main
PUBLIC ?ProcessOptimized@MyPlugin@@QAEXH@Z ; MyPlugin::ProcessOptimized
PUBLIC ?Process@MyPlugin@@QAEXH@Z ; MyPlugin::Process
PUBLIC ??1MyPlugin@@QAE@XZ ; MyPlugin::~MyPlugin
PUBLIC ??0MyPlugin@@QAE@XZ ; MyPlugin::MyPlugin
PUBLIC ?ProcessOptimized@Param@@QAEXHH@Z ; Param::ProcessOptimized
PUBLIC ?Process@Param@@QAEXHH@Z ; Param::Process
PUBLIC ??0Param@@QAE@XZ ; Param::Param
PUBLIC __real@3ff0000000000000
PUBLIC __real@400921fb54442d18
PUBLIC __real@4024000000000000
PUBLIC __real@406fe00000000000
PUBLIC __xmm@00000003000000020000000100000000
PUBLIC __xmm@400921fb54442d18400921fb54442d18
PUBLIC __xmm@406fe00000000000406fe00000000000
EXTRN __chkstk:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; Read-only floating-point / integer constants referenced by the functions
; in this listing. Each constant lives in its own COMDAT CONST segment.
; The DB strings mix hex bytes and ASCII characters; the bytes are stored
; least-significant first, so they spell the value in the symbol name backwards
; ('o' = 6fH, '@' = 40H, '-' = 2dH, 'D' = 44H, 'T' = 54H, '!' = 21H).
; COMDAT __xmm@406fe00000000000406fe00000000000
CONST SEGMENT
; 16 bytes: two copies of 406fe00000000000H, i.e. the packed pair {255.0, 255.0}.
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
DB '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST ENDS
; COMDAT __xmm@400921fb54442d18400921fb54442d18
CONST SEGMENT
; 16 bytes: two copies of 400921fb54442d18H, i.e. the packed pair {PI, PI}.
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
DB 018H, '-DT', 0fbH, '!', 09H, '@'
CONST ENDS
; COMDAT __xmm@00000003000000020000000100000000
CONST SEGMENT
; 16 bytes: four 32-bit integers {0, 1, 2, 3} (lowest lane first).
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
DB 00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST ENDS
; COMDAT __real@406fe00000000000
CONST SEGMENT
; Scalar double 255.0.
__real@406fe00000000000 DQ 0406fe00000000000r ; 255
CONST ENDS
; COMDAT __real@4024000000000000
CONST SEGMENT
; Scalar double 10.0.
__real@4024000000000000 DQ 04024000000000000r ; 10
CONST ENDS
; COMDAT __real@400921fb54442d18
CONST SEGMENT
; Scalar double PI.
__real@400921fb54442d18 DQ 0400921fb54442d18r ; 3.14159
CONST ENDS
; COMDAT __real@3ff0000000000000
CONST SEGMENT
; Scalar double 1.0.
__real@3ff0000000000000 DQ 03ff0000000000000r ; 1
CONST ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0Param@@QAE@XZ
_TEXT SEGMENT
; Param::Param() -- default constructor. `this` arrives in ecx and is
; returned in eax. Writes five doubles at 16-byte strides from `this`.
; The field names in the comments below come from the source annotations in
; the MyPlugin constructor listing, which initialises the same offsets.
??0Param@@QAE@XZ PROC ; Param::Param, COMDAT
; _this$ = ecx
; 23 : Param() { }
; xmm0 = 0.0
xorps xmm0, xmm0
; eax = this (return value)
mov eax, ecx
; [this+0] = 0.0 (mPhase), [this+16] = 0.0 (mPhaseOptimized)
movsd QWORD PTR [ecx], xmm0
movsd QWORD PTR [ecx+16], xmm0
; [this+32] = 10.0 (mNoteFrequency)
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [ecx+32], xmm0
; [this+48] = 1.0 (mHostPitch), [this+64] = 1.0 (mRadiansPerSample)
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [ecx+48], xmm0
movsd QWORD PTR [ecx+64], xmm0
; No stack arguments to pop.
ret 0
??0Param@@QAE@XZ ENDP ; Param::Param
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::Process(int voiceIndex, int blockSize) -- scalar phase accumulation.
; `this` arrives in ecx; voiceIndex is read from the stack; the blockSize
; argument is marked dead and never read -- the loop runs a fixed 256 times.
;
; Register roles inside the loop:
;   esi  = this
;   edx  = address of the current c element (this + 2128 + voiceIndex*2048);
;          the matching b element is read 2048 bytes below it
;   ecx  = iterations remaining
;   xmm1 = running phase, xmm2 = PI, xmm5 = 0.0
;   xmm3 = [this+48] * [this+32] (bp0), xmm4 = [this+64] (mRadiansPerSample)
;
; ebp-relative temporaries used by the inlined std::clamp:
;   $T1 = PI (written once, never read back in this listing)
;   $T2 = 0.0 (lower bound), $T3 = unclamped value of the current sample
$T1 = -24 ; size = 8
$T3 = -16 ; size = 8
$T2 = -8 ; size = 8
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?Process@Param@@QAEXHH@Z PROC ; Param::Process, COMDAT
; _this$ = ecx
; 25 : inline void Process(int voiceIndex, int blockSize) {
; Standard ebp frame with 24 bytes of locals for the three temporaries.
push ebp
mov ebp, esp
sub esp, 24 ; 00000018H
; 26 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebp]
; xmm5 = 0.0 (clamp lower bound)
xorps xmm5, xmm5
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; xmm2 = PI (clamp upper bound)
movsd xmm2, QWORD PTR __real@400921fb54442d18
; esi and edi are saved here and restored in the epilogue.
push esi
mov esi, ecx
; eax = voiceIndex * 2048 bytes (one row of 256 doubles)
shl eax, 11 ; 0000000bH
push edi
movsd QWORD PTR $T1[ebp], xmm2
; ecx = fixed trip count of 256
mov ecx, 256 ; 00000100H
movsd QWORD PTR $T2[ebp], xmm5
; xmm3 = [this+48]; multiplied by [this+32] below to form bp0
movsd xmm3, QWORD PTR [esi+48]
; edx = this + 2128; the row offset in eax is added below
lea edx, DWORD PTR [esi+2128]
; xmm1 = phase loaded from [this+0]
movsd xmm1, QWORD PTR [esi]
add edx, eax
mulsd xmm3, QWORD PTR [esi+32]
; xmm4 = [this+64]
movsd xmm4, QWORD PTR [esi+64]
npad 11
$LL4@Process:
; xmm0 = xmm4 * (xmm3 * b[i] + c[i])
movsd xmm0, QWORD PTR [edx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [edx]
mulsd xmm0, xmm4
; Compare value with PI; the value is also spilled to $T3 so the lower-bound
; path can select it by address.
comisd xmm0, xmm2
movsd QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; Not above PI (or unordered): go check the lower bound. Otherwise result = PI.
jbe SHORT $LN10@Process
movaps xmm0, xmm2
jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
; Compare 0.0 with value; lea does not modify flags, so cmovbe below still
; sees this result.
comisd xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; eax = &0.0 by default; switched to &value when 0.0 <= value (or unordered).
lea eax, DWORD PTR $T2[ebp]
lea edi, DWORD PTR $T3[ebp]
cmovbe eax, edi
; xmm0 = clamped result loaded through the selected pointer
movsd xmm0, QWORD PTR [eax]
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
; Advance to the next double.
add edx, 8
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; phase += clamped value; loop until the counter reaches zero.
addsd xmm1, xmm0
sub ecx, 1
jne SHORT $LL4@Process
; 35 : }
; 36 :
; 37 : mPhase = phase;
; 38 : }
pop edi
; Write the accumulated phase back to [this+0].
movsd QWORD PTR [esi], xmm1
pop esi
mov esp, ebp
pop ebp
; Callee pops the two 4-byte stack arguments.
ret 8
?Process@Param@@QAEXHH@Z ENDP ; Param::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 version that
; handles two doubles per iteration. `this` arrives in ecx; the blockSize
; argument is marked dead and never read -- the loop runs a fixed 128 times.
;
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 + voiceIndex*2048, i.e. the address of c[voiceIndex][1]
;          before the first iteration; advanced by 16 bytes per iteration.
;          b elements are reached at eax-2048 / eax-2040.
;   ecx  = iterations remaining
;   xmm0 = v_phase (both lanes), xmm7 = packed 0.0 (lower bound)
;   xmm5 = packed [this+64]*[this+48]*[this+32], xmm6 = packed [this+64]
;   xmm4 / xmm1 = v_pB0 / v_pC0, xmm3 / xmm2 = v_pB1 / v_pC1 (already scaled)
;
; _v_phase$ is a 16-byte stack slot (ebp-relative) that receives v_phase on
; every iteration; its high lane is read back after the loop.
_v_phase$ = -16 ; size = 16
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC ; Param::ProcessOptimized, COMDAT
; _this$ = ecx
; 39 : inline void ProcessOptimized(int voiceIndex, int blockSize) {
; Stack-realigning prologue: ebx keeps the incoming stack pointer (arguments
; are addressed through ebx), esp is rounded down so that ebp ends up 16-byte
; aligned, and the return address at [ebx+4] is copied next to the saved ebp.
; The alignment is what allows the movaps store to _v_phase$ in the loop.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
; 40 : double *pB = b[voiceIndex];
mov eax, DWORD PTR _voiceIndex$[ebx]
mov edx, ecx
; eax = voiceIndex * 2048 bytes (one row of 256 doubles)
shl eax, 11 ; 0000000bH
; xmm3 / xmm2 start as 0.0; they become the low lanes of v_pB1 / v_pC1.
xorps xmm3, xmm3
xorps xmm2, xmm2
; Allocate the 16-byte _v_phase$ slot.
sub esp, 16 ; 00000010H
; xmm7 = packed 0.0 (v_boundLower)
xorps xmm7, xmm7
; ecx = fixed trip count of 128 (two samples per iteration)
mov ecx, 128 ; 00000080H
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
; xmm5 = [this+48] * [this+32] (bp0)
movsd xmm5, QWORD PTR [edx+48]
mulsd xmm5, QWORD PTR [edx+32]
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
; xmm6 = [this+64] (mRadiansPerSample)
movsd xmm6, QWORD PTR [edx+64]
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
; xmm0 = pB[0]; xmm4 = {pB[0], pB[1]}; xmm1 = {pC[0], pC[1]}.
; Note the compiler emits movups (unaligned load) even for _mm_load_pd.
movsd xmm0, QWORD PTR [eax+edx+80]
movups xmm4, XMMWORD PTR [eax+edx+80]
movups xmm1, XMMWORD PTR [eax+edx+2128]
; xmm5 = mRadiansPerSample * bp0 (still scalar; broadcast below)
mulsd xmm5, xmm6
; xmm3 = {0.0, pB[0]}
unpcklpd xmm3, xmm0
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [eax+edx+2128]
; eax becomes this + voiceIndex*2048 + 2136 after the two adds.
add eax, 2136 ; 00000858H
; xmm2 = {0.0, pC[0]}
unpcklpd xmm2, xmm0
add eax, edx
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
; xmm0 = [this+16] (mPhaseOptimized); broadcast to both lanes below.
movsd xmm0, QWORD PTR [edx+16]
; Broadcast the two scale factors to both lanes.
unpcklpd xmm5, xmm5
unpcklpd xmm6, xmm6
; Pre-scale the four initial vectors.
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
mulpd xmm2, xmm6
unpcklpd xmm0, xmm0
npad 2
$LL4@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
; xmm1 = v_phaseAcc1 = v_pB0 + v_pC0
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; Next v_pB0 is loaded early (b row is 2048 bytes below the c row).
movups xmm4, XMMWORD PTR [eax-2040]
; xmm2 = v_phaseAcc2 = v_pB1 + v_pC1
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
; Clamp both accumulators to [0.0, PI].
maxpd xmm1, xmm7
maxpd xmm2, xmm7
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
; v_phase += v_phaseAcc1; then reuse xmm1 for the next v_pC0.
addpd xmm0, xmm1
movups xmm1, XMMWORD PTR [eax+8]
; v_phase += v_phaseAcc2; then reuse xmm2 for the next v_pC1.
addpd xmm0, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
; Advance by two doubles.
add eax, 16 ; 00000010H
; v_phase is written to its stack slot every iteration.
movaps XMMWORD PTR _v_phase$[ebp], xmm0
; Scale the freshly loaded vectors for the next iteration.
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm6
sub ecx, 1
jne SHORT $LL4@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; The lane index was folded to 1: the high lane of v_phase is stored to [this+16].
movsd xmm0, QWORD PTR _v_phase$[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 88 : }
; Undo the aligned frame, then restore the original stack pointer from ebx.
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
; Callee pops the two 4-byte stack arguments.
ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP ; Param::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::MyPlugin() -- constructor. `this` arrives in ecx and is returned
; in eax. It initialises the five scalar fields at offsets 0..64, fills 256
; doubles starting at this+80 with sampleIndex / 255.0 (four per iteration),
; and zero-fills 2048 bytes starting at this+2128.
;
; Register roles in the fill loop:
;   esi  = this, edx = sampleIndex (steps by 4), ecx = running store pointer
;   xmm2 = integer lanes {0, 1, 2, 3}, xmm3 = packed {255.0, 255.0}
??0MyPlugin@@QAE@XZ PROC ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
movaps xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000
; xmm0 = 0.0 for the first two field stores
xorps xmm0, xmm0
movaps xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000
; sampleIndex = 0
xor edx, edx
push esi
mov esi, ecx
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR [esi], xmm0
; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; ecx starts 8 bytes past this+80; the loop adds 32 and then stores at
; negative displacements, so the first store lands on this+80.
lea ecx, DWORD PTR [esi+88]
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR [esi+16], xmm0
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [esi+32], xmm0
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [esi+48], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
; First pair: broadcast sampleIndex, add lanes {0, 1}, convert the low two
; integers to double and divide by 255.0.
movd xmm0, edx
; eax = sampleIndex + 2, the base of the second pair (taken before edx is bumped)
lea eax, DWORD PTR [edx+2]
pshufd xmm1, xmm0, 0
lea ecx, DWORD PTR [ecx+32]
; movq copies only the low 64 bits of xmm2, i.e. lanes {0, 1}
movq xmm0, xmm2
add edx, 4
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
; Store elements sampleIndex and sampleIndex+1.
movlpd QWORD PTR [ecx-40], xmm0
movhpd QWORD PTR [ecx-32], xmm0
; Second pair: same computation starting from sampleIndex + 2.
movd xmm0, eax
pshufd xmm1, xmm0, 0
movq xmm0, xmm2
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
; Store elements sampleIndex+2 and sampleIndex+3.
movlpd QWORD PTR [ecx-24], xmm0
movhpd QWORD PTR [ecx-16], xmm0
cmp edx, 256 ; 00000100H
jl SHORT $LL7@MyPlugin
; 103 : }
; 104 : }
; 105 :
; 106 : // fill c
; 107 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
; The whole "fill c" loop nest became one rep stosd: 512 dwords (2048 bytes)
; of zero written starting at this+2128.
lea edi, DWORD PTR [esi+2128]
xor eax, eax
mov ecx, 512 ; 00000200H
rep stosd
; 109 : double value = 0.0;
; 110 :
; 111 : mParam1.c[voiceIndex][sampleIndex] = value;
; 112 : }
; 113 : }
; 114 : }
pop edi
; eax = this (return value)
mov eax, esi
pop esi
ret 0
??0MyPlugin@@QAE@XZ ENDP ; MyPlugin::MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??1MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor: it returns immediately and
; never reads `this` (ecx is marked dead).
??1MyPlugin@@QAE@XZ PROC ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx
; 115 : ~MyPlugin() { }
ret 0
??1MyPlugin@@QAE@XZ ENDP ; MyPlugin::~MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::Process(int blockSize) -- the body of Param::Process inlined
; (source lines 26-37 appear below). `this` arrives in ecx; the blockSize
; argument is marked dead and never read -- the loop runs a fixed 256 times.
; No voiceIndex offset is applied and the voice loop of source lines 118-120
; does not appear as a loop here (presumably voiceSize is 1 -- not visible
; in this listing).
;
; Register roles inside the loop:
;   esi  = this
;   ecx  = address of the current c element (starts at this + 2128); the
;          matching b element is read 2048 bytes below it
;   edx  = iterations remaining
;   xmm1 = running phase, xmm2 = PI, xmm5 = 0.0
;   xmm3 = [this+48] * [this+32] (bp0), xmm4 = [this+64] (mRadiansPerSample)
;
; ebp-relative temporaries used by the inlined std::clamp:
;   $T2 = PI (written once, never read back in this listing)
;   $T3 = 0.0 (lower bound), $T4 = unclamped value of the current sample
$T2 = -28 ; size = 8
$T4 = -20 ; size = 8
$T3 = -12 ; size = 8
_blockSize$dead$ = 8 ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC ; MyPlugin::Process, COMDAT
; _this$ = ecx
; 117 : void Process(int blockSize) {
; Standard ebp frame with 28 bytes of locals.
push ebp
mov ebp, esp
sub esp, 28 ; 0000001cH
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; xmm2 = PI (upper bound), xmm5 = 0.0 (lower bound)
movsd xmm2, QWORD PTR __real@400921fb54442d18
xorps xmm5, xmm5
; 117 : void Process(int blockSize) {
push esi
mov esi, ecx
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[ebp], xmm2
; 117 : void Process(int blockSize) {
push edi
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[ebp], xmm5
; edx = fixed trip count of 256
mov edx, 256 ; 00000100H
; xmm3 = [this+48]; multiplied by [this+32] below to form bp0
movsd xmm3, QWORD PTR [esi+48]
; 27 : double *pC = c[voiceIndex];
lea ecx, DWORD PTR [esi+2128]
; 28 : double phase = mPhase;
; 29 : double bp0 = mNoteFrequency * mHostPitch;
movsd xmm1, QWORD PTR [esi]
mulsd xmm3, QWORD PTR [esi+32]
; xmm4 = [this+64]
movsd xmm4, QWORD PTR [esi+64]
npad 3
$LL9@Process:
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; xmm0 = xmm4 * (xmm3 * b[i] + c[i])
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, xmm4
; Compare value with PI; the value is also spilled to $T4 so the lower-bound
; path can select it by address.
comisd xmm0, xmm2
movsd QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; Not above PI (or unordered): go check the lower bound. Otherwise result = PI.
jbe SHORT $LN15@Process
movaps xmm0, xmm2
jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
; Compare 0.0 with value; lea does not modify flags, so cmovbe below still
; sees this result.
comisd xmm5, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
; eax = &0.0 by default; switched to &value when 0.0 <= value (or unordered).
lea eax, DWORD PTR $T3[ebp]
lea edi, DWORD PTR $T4[ebp]
cmovbe eax, edi
; xmm0 = clamped result loaded through the selected pointer
movsd xmm0, QWORD PTR [eax]
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
; Advance to the next double.
add ecx, 8
; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
; phase += clamped value; loop until the counter reaches zero.
addsd xmm1, xmm0
sub edx, 1
jne SHORT $LL9@Process
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop edi
; 37 : mPhase = phase;
movsd QWORD PTR [esi], xmm1
; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }
pop esi
mov esp, ebp
pop ebp
; Callee pops the single 4-byte stack argument.
ret 4
?Process@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::ProcessOptimized(int blockSize) -- the body of
; Param::ProcessOptimized inlined (source lines 40-87 appear below).
; `this` arrives in ecx; the blockSize argument is marked dead and never
; read -- the loop runs a fixed 128 times, two doubles per iteration.
; No voiceIndex offset is applied and the voice loop of source lines 123-125
; does not appear as a loop here (presumably voiceSize is 1 -- not visible
; in this listing).
;
; Register roles inside the loop:
;   edx  = this
;   eax  = this + 2136 before the first iteration, advanced by 16 bytes per
;          iteration; b elements are reached at eax-2048 / eax-2040
;   ecx  = iterations remaining
;   xmm5 = v_phase (both lanes), xmm0 = packed 0.0 (lower bound)
;   xmm6 = packed [this+64]*[this+48]*[this+32], xmm7 = packed [this+64]
;   xmm4 / xmm1 = v_pB0 / v_pC0, xmm3 / xmm2 = v_pB1 / v_pC1 (already scaled)
;
; _v_phase$31 is a 16-byte stack slot (ebp-relative) that receives v_phase on
; every iteration; its high lane is read back after the loop.
_v_phase$31 = -16 ; size = 16
_blockSize$dead$ = 8 ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx
; 122 : void ProcessOptimized(int blockSize) {
; Stack-realigning prologue: ebx keeps the incoming stack pointer, esp is
; rounded down so that ebp ends up 16-byte aligned, and the return address at
; [ebx+4] is copied next to the saved ebp. The alignment is what allows the
; movaps store to _v_phase$31 in the loop.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
mov edx, ecx
; xmm3 / xmm2 start as 0.0; they become the low lanes of v_pB1 / v_pC1.
xorps xmm3, xmm3
xorps xmm2, xmm2
; Allocate the 16-byte _v_phase$31 slot.
sub esp, 16 ; 00000010H
; 40 : double *pB = b[voiceIndex];
; ecx = fixed trip count of 128
mov ecx, 128 ; 00000080H
; xmm6 = [this+48] * [this+32] (bp0)
movsd xmm6, QWORD PTR [edx+48]
lea eax, DWORD PTR [edx+2136]
mulsd xmm6, QWORD PTR [edx+32]
; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);
; xmm7 = [this+64] (mRadiansPerSample)
movsd xmm7, QWORD PTR [edx+64]
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
; xmm0 = pB[0]; xmm5 = [this+16] (phase); xmm4 = {pB[0], pB[1]};
; xmm1 = {pC[0], pC[1]}. The compiler emits movups even for _mm_load_pd.
movsd xmm0, QWORD PTR [edx+80]
movsd xmm5, QWORD PTR [edx+16]
movups xmm4, XMMWORD PTR [edx+80]
movups xmm1, XMMWORD PTR [edx+2128]
; xmm6 = mRadiansPerSample * bp0 (still scalar; broadcast below)
mulsd xmm6, xmm7
; xmm3 = {0.0, pB[0]}
unpcklpd xmm3, xmm0
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm0, QWORD PTR [edx+2128]
; Broadcast the two scale factors to both lanes.
unpcklpd xmm7, xmm7
unpcklpd xmm6, xmm6
; xmm2 = {0.0, pC[0]}
unpcklpd xmm2, xmm0
; xmm0 = packed 0.0 (v_boundLower) from here on
xorps xmm0, xmm0
; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
mulpd xmm4, xmm6
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
mulpd xmm1, xmm7
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
mulpd xmm3, xmm6
; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);
unpcklpd xmm5, xmm5
npad 13
$LL9@ProcessOpt:
; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);
; xmm1 = v_phaseAcc1 = v_pB0 + v_pC0
addpd xmm1, xmm4
; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);
; Next v_pB0 is loaded early (b data is 2048 bytes below the c data).
movups xmm4, XMMWORD PTR [eax-2040]
; xmm2 = v_phaseAcc2 = v_pB1 + v_pC1
addpd xmm2, xmm3
; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);
movups xmm3, XMMWORD PTR [eax-2048]
; Clamp both accumulators to [0.0, PI].
maxpd xmm1, xmm0
maxpd xmm2, xmm0
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
; v_phase += v_phaseAcc1; then reuse xmm1 for the next v_pC0.
addpd xmm5, xmm1
movups xmm1, XMMWORD PTR [eax+8]
; v_phase += v_phaseAcc2; then reuse xmm2 for the next v_pC1.
addpd xmm5, xmm2
; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);
movups xmm2, XMMWORD PTR [eax]
; Advance by two doubles.
add eax, 16 ; 00000010H
; v_phase is written to its stack slot every iteration.
movaps XMMWORD PTR _v_phase$31[ebp], xmm5
; Scale the freshly loaded vectors for the next iteration.
mulpd xmm4, xmm6
mulpd xmm1, xmm7
mulpd xmm3, xmm6
; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
mulpd xmm2, xmm7
sub ecx, 1
jne SHORT $LL9@ProcessOpt
; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];
; The lane index was folded to 1: the high lane of v_phase is stored to [this+16].
movsd xmm0, QWORD PTR _v_phase$31[ebp+8]
movsd QWORD PTR [edx+16], xmm0
; 123 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124 : mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125 : }
; 126 : }
; Undo the aligned frame, then restore the original stack pointer from ebx.
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
; Callee pops the single 4-byte stack argument.
ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT _main
_TEXT SEGMENT
_counterProcessing$1$ = -4304 ; size = 4
_counterProcessing$ = -4304 ; size = 8
_bp0$1$ = -4296 ; size = 8
_v_radiansPerSample$1$ = -4288 ; size = 16
$T3 = -4264 ; size = 8
_v_phase$38 = -4256 ; size = 16
$T4 = -4256 ; size = 8
$T2 = -4232 ; size = 8
tv1040 = -4224 ; size = 16
tv1039 = -4208 ; size = 16
_myPlugin$ = -4192 ; size = 4176
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; 129 : int main() {
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
mov eax, 4312 ; 000010d8H
call __chkstk
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+4312], eax
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd xmm0, QWORD PTR __real@4024000000000000
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR _myPlugin$[esp+4392]
movsd xmm1, QWORD PTR __real@406fe00000000000
xorps xmm2, xmm2
; 16 : alignas(16) double mNoteFrequency = 10.0;
movsd QWORD PTR _myPlugin$[esp+4344], xmm0
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 17 : alignas(16) double mHostPitch = 1.0;
movsd xmm0, QWORD PTR __real@3ff0000000000000
; 129 : int main() {
push esi
push edi
; 14 : alignas(16) double mPhase = 0.0;
movsd QWORD PTR _myPlugin$[esp+4320], xmm2
; 15 : alignas(16) double mPhaseOptimized = 0.0;
movsd QWORD PTR _myPlugin$[esp+4336], xmm2
; 17 : alignas(16) double mHostPitch = 1.0;
movsd QWORD PTR _myPlugin$[esp+4368], xmm0
; 18 : alignas(16) double mRadiansPerSample = 1.0;
movsd QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
movd xmm0, eax
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea ecx, DWORD PTR [ecx+8]
; 100 : double value = (sampleIndex / ((double)bufferSize - 1));
cvtdq2pd xmm0, xmm0
inc eax
divsd xmm0, xmm1
; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;
movsd QWORD PTR [ecx-8], xmm0
cmp eax, 256 ; 00000100H
jl SHORT $LL11@main
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm6, QWORD PTR __real@400921fb54442d18
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
lea edi, DWORD PTR _myPlugin$[esp+6448]
mov ecx, 512 ; 00000200H
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T2[esp+4320], xmm6
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
xor eax, eax
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd QWORD PTR $T3[esp+4320], xmm2
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
rep stosd
movsd xmm3, QWORD PTR _myPlugin$[esp+4352]
xorps xmm0, xmm0
mulsd xmm3, QWORD PTR _myPlugin$[esp+4368]
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movaps xmm4, xmm2
movsd xmm1, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
movsd xmm5, QWORD PTR _myPlugin$[esp+4336]
; 130 : MyPlugin myPlugin;
; 131 :
; 132 : long long numProcessing = 5;
; 133 : long long counterProcessing = 0;
movlpd QWORD PTR _counterProcessing$[esp+4320], xmm0
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);
movsd xmm0, QWORD PTR _myPlugin$[esp+4400]
movaps xmm7, xmm3
mulsd xmm7, QWORD PTR _myPlugin$[esp+4384]
; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);
mov edi, DWORD PTR _counterProcessing$[esp+4324]
mov esi, DWORD PTR _counterProcessing$[esp+4320]
unpcklpd xmm4, xmm0
movsd xmm0, QWORD PTR _myPlugin$[esp+6448]
movups XMMWORD PTR tv1040[esp+4320], xmm4
movaps xmm4, xmm2
unpcklpd xmm1, xmm1
unpcklpd xmm4, xmm0
movups XMMWORD PTR tv1039[esp+4320], xmm4
movsd xmm4, QWORD PTR _myPlugin$[esp+4320]
movsd QWORD PTR _bp0$1$[esp+4320], xmm3
unpcklpd xmm7, xmm7
movaps XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
npad 8
$LL2@main:
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
add esi, 1
; 26 : double *pB = b[voiceIndex];
lea ecx, DWORD PTR _myPlugin$[esp+6448]
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
mov DWORD PTR _counterProcessing$1$[esp+4320], esi
; 26 : double *pB = b[voiceIndex];
mov edx, 256 ; 00000100H
; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {
adc edi, 0
npad 10
$LL29@main:
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);
movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, QWORD PTR _myPlugin$[esp+4384]
comisd xmm0, xmm6
movsd QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
jbe SHORT $LN35@main
movaps xmm0, xmm6
jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; 287 : return (static_cast<_Ty1&&>(_Left)
comisd xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)
lea eax, DWORD PTR $T3[esp+4320]
lea esi, DWORD PTR $T4[esp+4320]
cmovbe eax, esi
movsd xmm0, QWORD PTR [eax]
// ...
(注意:由于 StackOverflow 对帖子长度有限制,我删除了其中一些行。)
两者差别很大。另外,我发现 VS 生成的代码有些冗余:例如搜索字符串 phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); 会发现它出现了很多次。
我遗漏了哪些设置?我已经在 X86 构建上匹配了相同的 MSVC 版本 (19.15),并且也设置了我实际使用的优化选项。
最佳答案
您似乎没有使用相同的编译器标志。来自 Visual Studio 的汇编转储显示每个函数都使用标志 /Ogtp 进行了优化,当您在命令行中指定 /Og 时,这些标志会在内部使用。另一方面,在 Godbolt 版本中,您使用了 /Ot /O2,它在内部对应于 /Ogtpy。如果我手动添加 /Oy 标志,代码会略有不同,但仍然与 Visual Studio 生成的代码不同。
我意识到编译器版本并不完全相同,但 19.15.26726.0 和 19.15.26732.1 之间的差异非常小,可能仅包括错误修复。我认为还有其他不同的标志。您可以转到项目的属性页,并在“所有选项”和“其他选项”窗格中找到已使用的所有编译器选项。在 Release 版本中,除了 /arch:SSE2 /Ot /O2 之外,还使用了许多选项。注意 /arch:SSE2 是默认值,因此您不必明确指定它。另外,/O2 隐含了 /Ot,所以 /arch:SSE2 /Ot /O2 等同于 /O2。
关于c++ - 为什么 godbolt 生成的 asm 输出与我在 Visual Studio 中的实际 asm 代码不同?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54035228/
#include using namespace std; class C{ private: int value; public: C(){ value = 0;
这个问题已经有答案了: What is the difference between char a[] = ?string?; and char *p = ?string?;? (8 个回答) 已关闭
关闭。此题需要details or clarity 。目前不接受答案。 想要改进这个问题吗?通过 editing this post 添加详细信息并澄清问题. 已关闭 7 年前。 此帖子已于 8 个月
除了调试之外,是否有任何针对 c、c++ 或 c# 的测试工具,其工作原理类似于将独立函数复制粘贴到某个文本框,然后在其他文本框中输入参数? 最佳答案 也许您会考虑单元测试。我推荐你谷歌测试和谷歌模拟
我想在第二台显示器中移动一个窗口 (HWND)。问题是我尝试了很多方法,例如将分辨率加倍或输入负值,但它永远无法将窗口放在我的第二台显示器上。 关于如何在 C/C++/c# 中执行此操作的任何线索 最
我正在寻找 C/C++/C## 中不同类型 DES 的现有实现。我的运行平台是Windows XP/Vista/7。 我正在尝试编写一个 C# 程序,它将使用 DES 算法进行加密和解密。我需要一些实
很难说出这里要问什么。这个问题模棱两可、含糊不清、不完整、过于宽泛或夸夸其谈,无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开,visit the help center . 关闭 1
有没有办法强制将另一个 窗口置于顶部? 不是应用程序的窗口,而是另一个已经在系统上运行的窗口。 (Windows, C/C++/C#) 最佳答案 SetWindowPos(that_window_ha
假设您可以在 C/C++ 或 Csharp 之间做出选择,并且您打算在 Windows 和 Linux 服务器上运行同一服务器的多个实例,那么构建套接字服务器应用程序的最明智选择是什么? 最佳答案 如
你们能告诉我它们之间的区别吗? 顺便问一下,有什么叫C++库或C库的吗? 最佳答案 C++ 标准库 和 C 标准库 是 C++ 和 C 标准定义的库,提供给 C++ 和 C 程序使用。那是那些词的共同
下面的测试代码,我将输出信息放在注释中。我使用的是 gcc 4.8.5 和 Centos 7.2。 #include #include class C { public:
很难说出这里问的是什么。这个问题是含糊的、模糊的、不完整的、过于宽泛的或修辞性的,无法以目前的形式得到合理的回答。如需帮助澄清此问题以便重新打开它,visit the help center 。 已关
我的客户将使用名为 annoucement 的结构/类与客户通信。我想我会用 C++ 编写服务器。会有很多不同的类继承annoucement。我的问题是通过网络将这些类发送给客户端 我想也许我应该使用
我在 C# 中有以下函数: public Matrix ConcatDescriptors(IList> descriptors) { int cols = descriptors[0].Co
我有一个项目要编写一个函数来对某些数据执行某些操作。我可以用 C/C++ 编写代码,但我不想与雇主共享该函数的代码。相反,我只想让他有权在他自己的代码中调用该函数。是否可以?我想到了这两种方法 - 在
我使用的是编写糟糕的第 3 方 (C/C++) Api。我从托管代码(C++/CLI)中使用它。有时会出现“访问冲突错误”。这使整个应用程序崩溃。我知道我无法处理这些错误[如果指针访问非法内存位置等,
关闭。这个问题不符合Stack Overflow guidelines .它目前不接受答案。 我们不允许提问寻求书籍、工具、软件库等的推荐。您可以编辑问题,以便用事实和引用来回答。 关闭 7 年前。
已关闭。此问题不符合Stack Overflow guidelines 。目前不接受答案。 要求我们推荐或查找工具、库或最喜欢的场外资源的问题对于 Stack Overflow 来说是偏离主题的,因为
我有一些 C 代码,将使用 P/Invoke 从 C# 调用。我正在尝试为这个 C 函数定义一个 C# 等效项。 SomeData* DoSomething(); struct SomeData {
这个问题已经有答案了: Why are these constructs using pre and post-increment undefined behavior? (14 个回答) 已关闭 6
我是一名优秀的程序员,十分优秀!