gpt4 book ai didi

c++ - 为什么 godbolt 生成的 asm 输出与我在 Visual Studio 中的实际 asm 代码不同?

转载 作者:太空狗 更新时间:2023-10-29 21:32:42 25 4
gpt4 key购买 nike

这是 godbolt(Compiler Explorer)生成的代码。

下面是 Visual Studio 为同一份代码生成的 main.asm 文件内容(通过在 Project -> C/C++ -> Output Files 下将 Assembler Output 字段设为 Assembly With Source Code (/FAs) 启用):

; Listing generated by Microsoft (R) Optimizing Compiler Version 19.15.26732.1 

TITLE c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
.686P
.XMM
include listing.inc
.model flat

INCLUDELIB OLDNAMES

EXTRN __imp____std_terminate:PROC
EXTRN @__security_check_cookie@4:PROC
EXTRN __imp____CxxFrameHandler3:PROC
PUBLIC ??$?RABNABN@?$less@X@std@@QBE_NABN0@Z ; std::less<void>::operator()<double const &,double const &>
PUBLIC ??$clamp@NU?$less@X@std@@@std@@YAABNABN00U?$less@X@0@@Z ; std::clamp<double,std::less<void> >
PUBLIC ??$clamp@N@std@@YAABNABN00@Z ; std::clamp<double>
PUBLIC _main
PUBLIC ?ProcessOptimized@MyPlugin@@QAEXH@Z ; MyPlugin::ProcessOptimized
PUBLIC ?Process@MyPlugin@@QAEXH@Z ; MyPlugin::Process
PUBLIC ??1MyPlugin@@QAE@XZ ; MyPlugin::~MyPlugin
PUBLIC ??0MyPlugin@@QAE@XZ ; MyPlugin::MyPlugin
PUBLIC ?ProcessOptimized@Param@@QAEXHH@Z ; Param::ProcessOptimized
PUBLIC ?Process@Param@@QAEXHH@Z ; Param::Process
PUBLIC ??0Param@@QAE@XZ ; Param::Param
PUBLIC __real@3ff0000000000000
PUBLIC __real@400921fb54442d18
PUBLIC __real@4024000000000000
PUBLIC __real@406fe00000000000
PUBLIC __xmm@00000003000000020000000100000000
PUBLIC __xmm@400921fb54442d18400921fb54442d18
PUBLIC __xmm@406fe00000000000406fe00000000000
EXTRN __chkstk:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; COMDAT __xmm@406fe00000000000406fe00000000000
; 16-byte packed constant: two little-endian doubles, each 0x406fe00000000000
; (255.0, the same bit pattern as __real@406fe00000000000).
; The listing prints byte 6fH as 'o' and byte 40H as '@'.
CONST SEGMENT
__xmm@406fe00000000000406fe00000000000 DB 00H, 00H, 00H, 00H, 00H, 0e0H, 'o'
DB '@', 00H, 00H, 00H, 00H, 00H, 0e0H, 'o@'
CONST ENDS
; COMDAT __xmm@400921fb54442d18400921fb54442d18
; 16-byte packed constant: two little-endian doubles, each 0x400921fb54442d18
; (PI, the same bit pattern as __real@400921fb54442d18).
; '-DT' is the byte run 2dH 44H 54H and '!' is 21H.
CONST SEGMENT
__xmm@400921fb54442d18400921fb54442d18 DB 018H, '-DT', 0fbH, '!', 09H, '@'
DB 018H, '-DT', 0fbH, '!', 09H, '@'
CONST ENDS
; COMDAT __xmm@00000003000000020000000100000000
; 16-byte packed constant: four little-endian dwords {0, 1, 2, 3}
; (lane offsets added to a broadcast loop index in MyPlugin::MyPlugin).
CONST SEGMENT
__xmm@00000003000000020000000100000000 DB 00H, 00H, 00H, 00H, 01H, 00H, 00H
DB 00H, 02H, 00H, 00H, 00H, 03H, 00H, 00H, 00H
CONST ENDS
; COMDAT __real@406fe00000000000
CONST SEGMENT
__real@406fe00000000000 DQ 0406fe00000000000r ; 255
CONST ENDS
; COMDAT __real@4024000000000000
CONST SEGMENT
__real@4024000000000000 DQ 04024000000000000r ; 10
CONST ENDS
; COMDAT __real@400921fb54442d18
CONST SEGMENT
__real@400921fb54442d18 DQ 0400921fb54442d18r ; 3.14159
CONST ENDS
; COMDAT __real@3ff0000000000000
CONST SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r ; 1
CONST ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0Param@@QAE@XZ
_TEXT SEGMENT
; Param::Param() -- default constructor.
; In:   ecx = this
; Out:  eax = this
; Writes five doubles at 16-byte strides: [this+0] = 0.0, [this+16] = 0.0,
; [this+32] = 10.0, [this+48] = 1.0, [this+64] = 1.0.
; NOTE(review): the MyPlugin::MyPlugin listing labels the same offsets as
; mPhase, mPhaseOptimized, mNoteFrequency, mHostPitch and mRadiansPerSample --
; presumably these are the same members; confirm against main.cpp.
??0Param@@QAE@XZ PROC ; Param::Param, COMDAT
; _this$ = ecx

; 23 : Param() { }

xorps xmm0, xmm0 ; xmm0 = 0.0
mov eax, ecx ; return value = this
movsd QWORD PTR [ecx], xmm0
movsd QWORD PTR [ecx+16], xmm0
movsd xmm0, QWORD PTR __real@4024000000000000 ; 10.0
movsd QWORD PTR [ecx+32], xmm0
movsd xmm0, QWORD PTR __real@3ff0000000000000 ; 1.0, stored to two fields
movsd QWORD PTR [ecx+48], xmm0
movsd QWORD PTR [ecx+64], xmm0
ret 0
??0Param@@QAE@XZ ENDP ; Param::Param
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::Process(int voiceIndex, int blockSize) -- scalar phase accumulation.
; In:    ecx = this; [ebp+8] = voiceIndex; [ebp+12] = blockSize (marked dead:
;        the loop below uses the constant trip count 256 instead).
; Out:   QWORD [this+0] receives the accumulated phase; callee pops 8 bytes.
; Uses:  eax, ecx, edx, xmm0-xmm5; esi and edi are pushed and restored.
; Each pass computes v = [this+64] * (bp0 * pB[i] + pC[i]), clamps v to
; [0.0, PI] and adds it to the running phase in xmm1.
; The clamp selects between stack copies of the operands by address (lea/cmovbe
; followed by a load), which is why the value is spilled to $T3 on every pass.
$T1 = -24 ; size = 8 ; copy of PI; stored once and never read in this function
$T3 = -16 ; size = 8 ; copy of the current unclamped value
$T2 = -8 ; size = 8 ; copy of 0.0 (lower clamp bound)
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?Process@Param@@QAEXHH@Z PROC ; Param::Process, COMDAT
; _this$ = ecx

; 25 : inline void Process(int voiceIndex, int blockSize) {

push ebp
mov ebp, esp
sub esp, 24 ; 00000018H

; 26 : double *pB = b[voiceIndex];

mov eax, DWORD PTR _voiceIndex$[ebp]
xorps xmm5, xmm5 ; xmm5 = 0.0 (lower bound)

; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd xmm2, QWORD PTR __real@400921fb54442d18 ; xmm2 = PI (upper bound)
push esi
mov esi, ecx ; esi = this for the rest of the function
shl eax, 11 ; 0000000bH ; voiceIndex * 2048 bytes (row stride)
push edi
movsd QWORD PTR $T1[ebp], xmm2
mov ecx, 256 ; 00000100H ; loop counter
movsd QWORD PTR $T2[ebp], xmm5
movsd xmm3, QWORD PTR [esi+48]
lea edx, DWORD PTR [esi+2128]
movsd xmm1, QWORD PTR [esi] ; xmm1 = running phase, loaded from [this+0]
add edx, eax ; edx = this + 2128 + voiceIndex*2048 (pC row)
mulsd xmm3, QWORD PTR [esi+32] ; xmm3 = [this+48] * [this+32] (bp0)
movsd xmm4, QWORD PTR [esi+64] ; xmm4 = multiplier applied to the sum
; npad is a macro from listing.inc; presumably padding so the loop head is aligned.
npad 11
$LL4@Process:
movsd xmm0, QWORD PTR [edx-2048] ; pB[i]: the pB row sits 2048 bytes below the pC row
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [edx] ; + pC[i]
mulsd xmm0, xmm4
comisd xmm0, xmm2 ; compare v with PI; the flags survive the store below
movsd QWORD PTR $T3[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; v <= PI (or unordered) goes on to the lower-bound check; otherwise the result is PI.
jbe SHORT $LN10@Process
movaps xmm0, xmm2
jmp SHORT $LN11@Process
$LN10@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287 : return (static_cast<_Ty1&&>(_Left)

comisd xmm5, xmm0 ; compare 0.0 with v
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; eax = &0.0 by default; switched to &v when 0.0 <= v (or unordered). lea leaves the flags intact.
lea eax, DWORD PTR $T2[ebp]
lea edi, DWORD PTR $T3[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax] ; load the selected clamp result
$LN11@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

add edx, 8 ; advance to the next double in both rows

; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

addsd xmm1, xmm0
sub ecx, 1
jne SHORT $LL4@Process

; 35 : }
; 36 :
; 37 : mPhase = phase;
; 38 : }

pop edi
movsd QWORD PTR [esi], xmm1 ; write the accumulated phase back to [this+0]
pop esi
mov esp, ebp
pop ebp
ret 8
?Process@Param@@QAEXHH@Z ENDP ; Param::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@Param@@QAEXHH@Z
_TEXT SEGMENT
; Param::ProcessOptimized(int voiceIndex, int blockSize) -- SSE2 version of the
; phase accumulation, handling two doubles per pass.
; In:    ecx = this; [ebx+8] = voiceIndex (ebx holds the incoming stack frame);
;        blockSize is marked dead: the loop uses the constant trip count 128.
; Out:   QWORD [this+16] receives the high lane of the accumulated phase vector;
;        callee pops 8 bytes.
; Uses:  eax, ecx, edx, xmm0-xmm7; ebx and ebp are saved and restored.
; Register roles inside the loop:
;   xmm0 = v_phase accumulator      xmm7 = 0.0 vector (lower bound)
;   xmm4 / xmm1 = scaled pB0 / pC0  xmm3 / xmm2 = scaled pB1 / pC1
;   xmm5 = {s*bp0, s*bp0}, xmm6 = {s, s}, where s = [this+64]
;   eax  = address of pC[1] for the current position, advanced 16 bytes per pass
; NOTE(review): every pass loads the operands for the *next* pass, so the final
; pass loads one vector beyond the 256 doubles consumed from each row; those
; values are never used. Verify that these look-ahead reads stay in bounds.
_v_phase$ = -16 ; size = 16 ; 16-byte aligned spill slot for the accumulator
_voiceIndex$ = 8 ; size = 4
_blockSize$dead$ = 12 ; size = 4
?ProcessOptimized@Param@@QAEXHH@Z PROC ; Param::ProcessOptimized, COMDAT
; _this$ = ecx

; 39 : inline void ProcessOptimized(int voiceIndex, int blockSize) {

; Aligning prologue: ebx keeps the caller-relative frame (used for the arguments),
; esp is rounded down to a 16-byte boundary, and ebp becomes the aligned frame
; pointer. The return address at [ebx+4] is copied next to the saved ebp.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp

; 40 : double *pB = b[voiceIndex];

mov eax, DWORD PTR _voiceIndex$[ebx]
mov edx, ecx ; edx = this for the rest of the function
shl eax, 11 ; 0000000bH ; voiceIndex * 2048 bytes (row stride)
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H ; room for _v_phase$
xorps xmm7, xmm7 ; lower clamp bound {0.0, 0.0}
mov ecx, 128 ; 00000080H ; loop counter

; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;

movsd xmm5, QWORD PTR [edx+48]
mulsd xmm5, QWORD PTR [edx+32]

; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

movsd xmm6, QWORD PTR [edx+64]

; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

movsd xmm0, QWORD PTR [eax+edx+80] ; pB[0]
movups xmm4, XMMWORD PTR [eax+edx+80] ; pB[0..1]
movups xmm1, XMMWORD PTR [eax+edx+2128] ; pC[0..1]
mulsd xmm5, xmm6 ; xmm5 = bp0 * [this+64]
unpcklpd xmm3, xmm0 ; xmm3 = {0.0, pB[0]}

; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

movsd xmm0, QWORD PTR [eax+edx+2128] ; pC[0]
add eax, 2136 ; 00000858H
unpcklpd xmm2, xmm0 ; xmm2 = {0.0, pC[0]}
add eax, edx ; eax = this + 2136 + voiceIndex*2048 = &pC[1]

; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);
; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);

movsd xmm0, QWORD PTR [edx+16]
unpcklpd xmm5, xmm5 ; broadcast both scale factors to two lanes
unpcklpd xmm6, xmm6
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5
mulpd xmm2, xmm6
unpcklpd xmm0, xmm0 ; v_phase = {phase, phase}
; npad is a macro from listing.inc; presumably padding so the loop head is aligned.
npad 2
$LL4@ProcessOpt:

; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

addpd xmm1, xmm4 ; xmm1 = v_phaseAcc1

; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);

movups xmm4, XMMWORD PTR [eax-2040] ; next v_pB0 (eax-2048 is the matching pB+1 address)
addpd xmm2, xmm3 ; xmm2 = v_phaseAcc2

; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);

movups xmm3, XMMWORD PTR [eax-2048] ; next v_pB1
maxpd xmm1, xmm7 ; clamp both sums to [0.0, PI]
maxpd xmm2, xmm7
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm0, xmm1
movups xmm1, XMMWORD PTR [eax+8] ; next v_pC0
addpd xmm0, xmm2

; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);

movups xmm2, XMMWORD PTR [eax] ; next v_pC1
add eax, 16 ; 00000010H
; The accumulator is spilled on every pass; only the copy left by the last pass
; is read back (after the loop).
movaps XMMWORD PTR _v_phase$[ebp], xmm0
mulpd xmm4, xmm5
mulpd xmm1, xmm6
mulpd xmm3, xmm5

; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

mulpd xmm2, xmm6
sub ecx, 1
jne SHORT $LL4@ProcessOpt

; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

; The lane index is fixed at 1 here (offset +8 into the spilled vector).
movsd xmm0, QWORD PTR _v_phase$[ebp+8]
movsd QWORD PTR [edx+16], xmm0

; 88 : }

; Undo the aligned frame first, then the caller-relative frame kept in ebx.
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 8
?ProcessOptimized@Param@@QAEXHH@Z ENDP ; Param::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??0MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::MyPlugin() -- default constructor.
; In:   ecx = this
; Out:  eax = this
; Uses: eax, ecx, edx, xmm0-xmm3; esi and edi are pushed and restored.
; Work done:
;   1. Initializes the five doubles at [this+0], +16, +32, +48 and +64.
;   2. Fills 256 doubles starting at [this+80] with i / 255.0, four per pass.
;   3. Zeroes 2048 bytes starting at [this+2128] with rep stosd.
; Both source-level voiceIndex loops have collapsed to a single row here.
??0MyPlugin@@QAE@XZ PROC ; MyPlugin::MyPlugin, COMDAT
; _this$ = ecx

; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

movaps xmm2, XMMWORD PTR __xmm@00000003000000020000000100000000 ; dwords {0,1,2,3}
xorps xmm0, xmm0
movaps xmm3, XMMWORD PTR __xmm@406fe00000000000406fe00000000000 ; {255.0, 255.0}
xor edx, edx ; edx = sample index i
push esi
mov esi, ecx ; esi = this
push edi

; 14 : alignas(16) double mPhase = 0.0;

movsd QWORD PTR [esi], xmm0

; 97 : // fill b
; 98 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

; ecx = write cursor; it is bumped by 32 at the top of each pass, so the stores
; at ecx-40 .. ecx-16 start at this+80.
lea ecx, DWORD PTR [esi+88]

; 15 : alignas(16) double mPhaseOptimized = 0.0;

movsd QWORD PTR [esi+16], xmm0

; 16 : alignas(16) double mNoteFrequency = 10.0;

movsd xmm0, QWORD PTR __real@4024000000000000
movsd QWORD PTR [esi+32], xmm0

; 17 : alignas(16) double mHostPitch = 1.0;

movsd xmm0, QWORD PTR __real@3ff0000000000000
movsd QWORD PTR [esi+48], xmm0

; 18 : alignas(16) double mRadiansPerSample = 1.0;

movsd QWORD PTR [esi+64], xmm0
$LL7@MyPlugin:

; 100 : double value = (sampleIndex / ((double)bufferSize - 1));

movd xmm0, edx
lea eax, DWORD PTR [edx+2] ; index of the second pair in this pass
pshufd xmm1, xmm0, 0 ; broadcast i to all four dword lanes
lea ecx, DWORD PTR [ecx+32]
movq xmm0, xmm2 ; low two dwords {0, 1}; the upper half is zeroed
add edx, 4
paddd xmm1, xmm0 ; low lanes = {i, i+1}
cvtdq2pd xmm0, xmm1 ; convert the two low dwords to doubles
divpd xmm0, xmm3 ; / 255.0

; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;

movlpd QWORD PTR [ecx-40], xmm0
movhpd QWORD PTR [ecx-32], xmm0
; Same sequence again for the pair {i+2, i+3}.
movd xmm0, eax
pshufd xmm1, xmm0, 0
movq xmm0, xmm2
paddd xmm1, xmm0
cvtdq2pd xmm0, xmm1
divpd xmm0, xmm3
movlpd QWORD PTR [ecx-24], xmm0
movhpd QWORD PTR [ecx-16], xmm0
cmp edx, 256 ; 00000100H
jl SHORT $LL7@MyPlugin

; 103 : }
; 104 : }
; 105 :
; 106 : // fill c
; 107 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

; The whole zero-fill loop is emitted as one string store: 512 dwords = 2048 bytes.
lea edi, DWORD PTR [esi+2128]
xor eax, eax
mov ecx, 512 ; 00000200H
rep stosd

; 109 : double value = 0.0;
; 110 :
; 111 : mParam1.c[voiceIndex][sampleIndex] = value;
; 112 : }
; 113 : }
; 114 : }

pop edi
mov eax, esi ; return value = this
pop esi
ret 0
??0MyPlugin@@QAE@XZ ENDP ; MyPlugin::MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ??1MyPlugin@@QAE@XZ
_TEXT SEGMENT
; MyPlugin::~MyPlugin() -- empty destructor: the body is a bare return.
; ecx (this) is marked dead and no register or memory is touched.
??1MyPlugin@@QAE@XZ PROC ; MyPlugin::~MyPlugin, COMDAT
; _this$dead$ = ecx

; 115 : ~MyPlugin() { }

ret 0
??1MyPlugin@@QAE@XZ ENDP ; MyPlugin::~MyPlugin
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?Process@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::Process(int blockSize) -- the source-level voice loop and the call to
; Param::Process have been inlined into one scalar loop over a single row.
; In:    ecx = this; blockSize is marked dead (constant trip count 256).
; Out:   QWORD [this+0] receives the accumulated phase; callee pops 4 bytes.
; Uses:  eax, ecx, edx, xmm0-xmm5; esi and edi are pushed and restored.
; Each pass computes v = [this+64] * (bp0 * pB[i] + pC[i]), clamps v to
; [0.0, PI] by selecting between stack copies by address, and adds it to xmm1.
$T2 = -28 ; size = 8 ; copy of PI; stored once and never read in this function
$T4 = -20 ; size = 8 ; copy of the current unclamped value
$T3 = -12 ; size = 8 ; copy of 0.0 (lower clamp bound)
_blockSize$dead$ = 8 ; size = 4
?Process@MyPlugin@@QAEXH@Z PROC ; MyPlugin::Process, COMDAT
; _this$ = ecx

; 117 : void Process(int blockSize) {

push ebp
mov ebp, esp
sub esp, 28 ; 0000001cH

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd xmm2, QWORD PTR __real@400921fb54442d18 ; xmm2 = PI (upper bound)
xorps xmm5, xmm5 ; xmm5 = 0.0 (lower bound)

; 117 : void Process(int blockSize) {

push esi
mov esi, ecx ; esi = this

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd QWORD PTR $T2[ebp], xmm2

; 117 : void Process(int blockSize) {

push edi

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd QWORD PTR $T3[ebp], xmm5
mov edx, 256 ; 00000100H ; loop counter
movsd xmm3, QWORD PTR [esi+48]

; 27 : double *pC = c[voiceIndex];

lea ecx, DWORD PTR [esi+2128] ; ecx = pC cursor; pB is read at ecx-2048

; 28 : double phase = mPhase;
; 29 : double bp0 = mNoteFrequency * mHostPitch;

movsd xmm1, QWORD PTR [esi] ; xmm1 = running phase
mulsd xmm3, QWORD PTR [esi+32] ; xmm3 = bp0
movsd xmm4, QWORD PTR [esi+64] ; xmm4 = multiplier applied to the sum
; npad is a macro from listing.inc; presumably padding so the loop head is aligned.
npad 3
$LL9@Process:

; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, xmm4
comisd xmm0, xmm2 ; compare v with PI; the flags survive the store below
movsd QWORD PTR $T4[ebp], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; v <= PI (or unordered) goes on to the lower-bound check; otherwise the result is PI.
jbe SHORT $LN15@Process
movaps xmm0, xmm2
jmp SHORT $LN16@Process
$LN15@Process:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287 : return (static_cast<_Ty1&&>(_Left)

comisd xmm5, xmm0 ; compare 0.0 with v
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

; eax = &0.0 by default; switched to &v when 0.0 <= v (or unordered).
lea eax, DWORD PTR $T3[ebp]
lea edi, DWORD PTR $T4[ebp]
cmovbe eax, edi
movsd xmm0, QWORD PTR [eax] ; load the selected clamp result
$LN16@Process:
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp

; 31 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {

add ecx, 8 ; advance to the next double in both rows

; 32 : // some other code (that will use phase, like sin(phase))
; 33 :
; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

addsd xmm1, xmm0
sub edx, 1
jne SHORT $LL9@Process

; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }

pop edi

; 37 : mPhase = phase;

movsd QWORD PTR [esi], xmm1

; 118 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 119 : mParam1.Process(voiceIndex, blockSize);
; 120 : }
; 121 : }

pop esi
mov esp, ebp
pop ebp
ret 4
?Process@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::Process
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT ?ProcessOptimized@MyPlugin@@QAEXH@Z
_TEXT SEGMENT
; MyPlugin::ProcessOptimized(int blockSize) -- the source-level voice loop and
; the call to Param::ProcessOptimized have been inlined into one SSE2 loop over
; a single row, two doubles per pass.
; In:    ecx = this; blockSize is marked dead (constant trip count 128).
; Out:   QWORD [this+16] receives the high lane of the accumulated phase vector;
;        callee pops 4 bytes.
; Uses:  eax, ecx, edx, xmm0-xmm7; ebx and ebp are saved and restored.
; Register roles inside the loop:
;   xmm5 = v_phase accumulator      xmm0 = 0.0 vector (lower bound)
;   xmm4 / xmm1 = scaled pB0 / pC0  xmm3 / xmm2 = scaled pB1 / pC1
;   xmm6 = {s*bp0, s*bp0}, xmm7 = {s, s}, where s = [this+64]
;   eax  = this+2136 (&pC[1]) initially, advanced 16 bytes per pass
; NOTE(review): on the 128th pass eax = this + 2136 + 127*16 = this + 4168, so
; the look-ahead load at [eax+8] covers this+4176 .. this+4191. The _main
; listing declares _myPlugin$ with size 4176, so this appears to read 16 bytes
; past the object (the loaded values are never used). Confirm against main.cpp.
_v_phase$31 = -16 ; size = 16 ; 16-byte aligned spill slot for the accumulator
_blockSize$dead$ = 8 ; size = 4
?ProcessOptimized@MyPlugin@@QAEXH@Z PROC ; MyPlugin::ProcessOptimized, COMDAT
; _this$ = ecx

; 122 : void ProcessOptimized(int blockSize) {

; Aligning prologue: ebx keeps the caller-relative frame, esp is rounded down to
; a 16-byte boundary, and ebp becomes the aligned frame pointer. The return
; address at [ebx+4] is copied next to the saved ebp.
push ebx
mov ebx, esp
sub esp, 8
and esp, -16 ; fffffff0H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov DWORD PTR [esp+4], ebp
mov ebp, esp
mov edx, ecx ; edx = this for the rest of the function
xorps xmm3, xmm3
xorps xmm2, xmm2
sub esp, 16 ; 00000010H ; room for _v_phase$31

; 40 : double *pB = b[voiceIndex];

mov ecx, 128 ; 00000080H ; loop counter
movsd xmm6, QWORD PTR [edx+48]
lea eax, DWORD PTR [edx+2136] ; eax = &pC[1]; eax-2048 is &pB[1]
mulsd xmm6, QWORD PTR [edx+32] ; xmm6 = bp0

; 41 : double *pC = c[voiceIndex];
; 42 : double phase = mPhaseOptimized;
; 43 : double bp0 = mNoteFrequency * mHostPitch;
; 44 :
; 45 : __m128d v_boundLower = _mm_set1_pd(0.0);
; 46 : __m128d v_boundUpper = _mm_set1_pd(PI);
; 47 : __m128d v_radiansPerSampleBp0 = _mm_set1_pd(mRadiansPerSample * bp0);

movsd xmm7, QWORD PTR [edx+64]

; 54 :
; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

movsd xmm0, QWORD PTR [edx+80] ; pB[0]
movsd xmm5, QWORD PTR [edx+16] ; phase
movups xmm4, XMMWORD PTR [edx+80] ; pB[0..1]
movups xmm1, XMMWORD PTR [edx+2128] ; pC[0..1]
mulsd xmm6, xmm7 ; xmm6 = bp0 * [this+64]
unpcklpd xmm3, xmm0 ; xmm3 = {0.0, pB[0]}

; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

movsd xmm0, QWORD PTR [edx+2128] ; pC[0]
unpcklpd xmm7, xmm7 ; broadcast both scale factors to two lanes
unpcklpd xmm6, xmm6
unpcklpd xmm2, xmm0 ; xmm2 = {0.0, pC[0]}
xorps xmm0, xmm0 ; xmm0 becomes the lower clamp bound {0.0, 0.0}

; 48 : __m128d v_radiansPerSample = _mm_set1_pd(mRadiansPerSample);
; 49 :
; 50 : __m128d v_pB0 = _mm_load_pd(pB);
; 51 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);

mulpd xmm4, xmm6

; 52 : __m128d v_pC0 = _mm_load_pd(pC);
; 53 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);

mulpd xmm1, xmm7

; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);

mulpd xmm3, xmm6

; 58 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

mulpd xmm2, xmm7

; 59 :
; 60 : __m128d v_phaseAcc1;
; 61 : __m128d v_phaseAcc2;
; 62 : __m128d v_phase = _mm_set1_pd(phase);

unpcklpd xmm5, xmm5 ; v_phase = {phase, phase}
; npad is a macro from listing.inc; presumably padding so the loop head is aligned.
npad 13
$LL9@ProcessOpt:

; 63 :
; 64 : for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += 2, pB += 2, pC += 2) {
; 65 : // some other code (that will use phase, like sin(phase))
; 66 :
; 67 : v_phaseAcc1 = _mm_add_pd(v_pB0, v_pC0);

addpd xmm1, xmm4 ; xmm1 = v_phaseAcc1

; 68 : v_phaseAcc1 = _mm_max_pd(v_phaseAcc1, v_boundLower);
; 69 : v_phaseAcc1 = _mm_min_pd(v_phaseAcc1, v_boundUpper);
; 70 : v_phaseAcc2 = _mm_add_pd(v_pB1, v_pC1);
; 71 : v_phaseAcc2 = _mm_max_pd(v_phaseAcc2, v_boundLower);
; 72 : v_phaseAcc2 = _mm_min_pd(v_phaseAcc2, v_boundUpper);
; 73 : v_phase = _mm_add_pd(v_phase, v_phaseAcc1);
; 74 : v_phase = _mm_add_pd(v_phase, v_phaseAcc2);
; 75 :
; 76 : v_pB0 = _mm_load_pd(pB + 2);

movups xmm4, XMMWORD PTR [eax-2040] ; next v_pB0
addpd xmm2, xmm3 ; xmm2 = v_phaseAcc2

; 77 : v_pB0 = _mm_mul_pd(v_pB0, v_radiansPerSampleBp0);
; 78 : v_pC0 = _mm_load_pd(pC + 2);
; 79 : v_pC0 = _mm_mul_pd(v_pC0, v_radiansPerSample);
; 80 :
; 81 : v_pB1 = _mm_loadu_pd(pB + 1);

movups xmm3, XMMWORD PTR [eax-2048] ; next v_pB1
maxpd xmm1, xmm0 ; clamp both sums to [0.0, PI]
maxpd xmm2, xmm0
minpd xmm1, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
minpd xmm2, XMMWORD PTR __xmm@400921fb54442d18400921fb54442d18
addpd xmm5, xmm1
movups xmm1, XMMWORD PTR [eax+8] ; next v_pC0
addpd xmm5, xmm2

; 82 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 83 : v_pC1 = _mm_loadu_pd(pC + 1);

movups xmm2, XMMWORD PTR [eax] ; next v_pC1
add eax, 16 ; 00000010H
; The accumulator is spilled on every pass; only the copy left by the last pass
; is read back (after the loop).
movaps XMMWORD PTR _v_phase$31[ebp], xmm5
mulpd xmm4, xmm6
mulpd xmm1, xmm7
mulpd xmm3, xmm6

; 84 : v_pC1 = _mm_mul_pd(v_pC1, v_radiansPerSample);

mulpd xmm2, xmm7
sub ecx, 1
jne SHORT $LL9@ProcessOpt

; 85 : }
; 86 :
; 87 : mPhaseOptimized = v_phase.m128d_f64[blockSize % 2 == 0 ? 1 : 0];

; The lane index is fixed at 1 here (offset +8 into the spilled vector).
movsd xmm0, QWORD PTR _v_phase$31[ebp+8]
movsd QWORD PTR [edx+16], xmm0

; 123 : for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
; 124 : mParam1.ProcessOptimized(voiceIndex, blockSize);
; 125 : }
; 126 : }

; Undo the aligned frame first, then the caller-relative frame kept in ebx.
mov esp, ebp
pop ebp
mov esp, ebx
pop ebx
ret 4
?ProcessOptimized@MyPlugin@@QAEXH@Z ENDP ; MyPlugin::ProcessOptimized
_TEXT ENDS
; Function compile flags: /Ogtp
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm
; File c:\repos\analysis_vectorization\analysis_vectorization\main.cpp
; COMDAT _main
_TEXT SEGMENT
_counterProcessing$1$ = -4304 ; size = 4
_counterProcessing$ = -4304 ; size = 8
_bp0$1$ = -4296 ; size = 8
_v_radiansPerSample$1$ = -4288 ; size = 16
$T3 = -4264 ; size = 8
_v_phase$38 = -4256 ; size = 16
$T4 = -4256 ; size = 8
$T2 = -4232 ; size = 8
tv1040 = -4224 ; size = 16
tv1039 = -4208 ; size = 16
_myPlugin$ = -4192 ; size = 4176
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT

; 129 : int main() {

push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
mov eax, 4312 ; 000010d8H
call __chkstk
mov eax, DWORD PTR ___security_cookie
xor eax, esp
mov DWORD PTR __$ArrayPad$[esp+4312], eax

; 16 : alignas(16) double mNoteFrequency = 10.0;

movsd xmm0, QWORD PTR __real@4024000000000000

; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

lea ecx, DWORD PTR _myPlugin$[esp+4392]
movsd xmm1, QWORD PTR __real@406fe00000000000
xorps xmm2, xmm2

; 16 : alignas(16) double mNoteFrequency = 10.0;

movsd QWORD PTR _myPlugin$[esp+4344], xmm0

; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

xor eax, eax

; 17 : alignas(16) double mHostPitch = 1.0;

movsd xmm0, QWORD PTR __real@3ff0000000000000

; 129 : int main() {

push esi
push edi

; 14 : alignas(16) double mPhase = 0.0;

movsd QWORD PTR _myPlugin$[esp+4320], xmm2

; 15 : alignas(16) double mPhaseOptimized = 0.0;

movsd QWORD PTR _myPlugin$[esp+4336], xmm2

; 17 : alignas(16) double mHostPitch = 1.0;

movsd QWORD PTR _myPlugin$[esp+4368], xmm0

; 18 : alignas(16) double mRadiansPerSample = 1.0;

movsd QWORD PTR _myPlugin$[esp+4384], xmm0
$LL11@main:
movd xmm0, eax

; 99 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

lea ecx, DWORD PTR [ecx+8]

; 100 : double value = (sampleIndex / ((double)bufferSize - 1));

cvtdq2pd xmm0, xmm0
inc eax
divsd xmm0, xmm1

; 101 :
; 102 : mParam1.b[voiceIndex][sampleIndex] = value;

movsd QWORD PTR [ecx-8], xmm0
cmp eax, 256 ; 00000100H
jl SHORT $LL11@main

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd xmm6, QWORD PTR __real@400921fb54442d18

; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

lea edi, DWORD PTR _myPlugin$[esp+6448]
mov ecx, 512 ; 00000200H

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd QWORD PTR $T2[esp+4320], xmm6

; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

xor eax, eax

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd QWORD PTR $T3[esp+4320], xmm2

; 108 : for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {

rep stosd
movsd xmm3, QWORD PTR _myPlugin$[esp+4352]
xorps xmm0, xmm0
mulsd xmm3, QWORD PTR _myPlugin$[esp+4368]

; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

movaps xmm4, xmm2
movsd xmm1, QWORD PTR _myPlugin$[esp+4384]

; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

movsd xmm5, QWORD PTR _myPlugin$[esp+4336]

; 130 : MyPlugin myPlugin;
; 131 :
; 132 : long long numProcessing = 5;
; 133 : long long counterProcessing = 0;

movlpd QWORD PTR _counterProcessing$[esp+4320], xmm0

; 55 : __m128d v_pB1 = _mm_setr_pd(0.0, pB[0]);

movsd xmm0, QWORD PTR _myPlugin$[esp+4400]
movaps xmm7, xmm3
mulsd xmm7, QWORD PTR _myPlugin$[esp+4384]

; 56 : v_pB1 = _mm_mul_pd(v_pB1, v_radiansPerSampleBp0);
; 57 : __m128d v_pC1 = _mm_setr_pd(0.0, pC[0]);

mov edi, DWORD PTR _counterProcessing$[esp+4324]
mov esi, DWORD PTR _counterProcessing$[esp+4320]
unpcklpd xmm4, xmm0
movsd xmm0, QWORD PTR _myPlugin$[esp+6448]
movups XMMWORD PTR tv1040[esp+4320], xmm4
movaps xmm4, xmm2
unpcklpd xmm1, xmm1
unpcklpd xmm4, xmm0
movups XMMWORD PTR tv1039[esp+4320], xmm4
movsd xmm4, QWORD PTR _myPlugin$[esp+4320]
movsd QWORD PTR _bp0$1$[esp+4320], xmm3
unpcklpd xmm7, xmm7
movaps XMMWORD PTR _v_radiansPerSample$1$[esp+4320], xmm1
npad 8
$LL2@main:

; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {

add esi, 1

; 26 : double *pB = b[voiceIndex];

lea ecx, DWORD PTR _myPlugin$[esp+6448]

; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {

mov DWORD PTR _counterProcessing$1$[esp+4320], esi

; 26 : double *pB = b[voiceIndex];

mov edx, 256 ; 00000100H

; 134 :
; 135 : // I'll only process once block, just for analysis
; 136 : while (counterProcessing++ < numProcessing) {

adc edi, 0
npad 10
$LL29@main:

; 34 : phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI);

movsd xmm0, QWORD PTR [ecx-2048]
mulsd xmm0, xmm3
addsd xmm0, QWORD PTR [ecx]
mulsd xmm0, QWORD PTR _myPlugin$[esp+4384]
comisd xmm0, xmm6
movsd QWORD PTR $T4[esp+4320], xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

jbe SHORT $LN35@main
movaps xmm0, xmm6
jmp SHORT $LN36@main
$LN35@main:
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\xstddef

; 287 : return (static_cast<_Ty1&&>(_Left)

comisd xmm2, xmm0
; File c:\program files (x86)\microsoft visual studio\2017\community\vc\tools\msvc\14.15.26726\include\algorithm

; 5669 : return (_DEBUG_LT_PRED(_Pred, _Max_val, _Val)

lea eax, DWORD PTR $T3[esp+4320]
lea esi, DWORD PTR $T4[esp+4320]
cmovbe eax, esi
movsd xmm0, QWORD PTR [eax]

// ...

(注意:由于 StackOverflow 的篇幅限制,我删去了部分行。)

两者差别很大。另外,我发现 VS 生成的代码有些冗余:例如搜索字符串 phase += std::clamp(mRadiansPerSample * (bp0 * pB[sampleIndex] + pC[sampleIndex]), 0.0, PI); 会发现它出现了很多次。

我遗漏了哪些设置?我已经在 x86 构建上使用了相同的 MSVC 版本(19.15),并且也设置了我实际使用的优化选项。

最佳答案

您似乎没有使用相同的编译器标志。Visual Studio 的汇编转储显示每个函数都使用标志 /Ogtp 进行了优化,当您在命令行中指定 /Og 时,这些标志会在内部使用。另一方面,在 Godbolt 版本中,您使用了 /Ot /O2,它在内部对应于 /Ogtpy。如果我手动添加 /Oy 标志,代码会略有不同,但仍然与 Visual Studio 生成的代码不同。

我意识到编译器版本并不完全相同,但 19.15.26726.0 和 19.15.26732.1 之间的差异非常小,可能仅包括错误修复。我认为还有其他不同的标志。您可以转到项目的属性页,并在“所有选项”和“其他选项”窗格中找到已使用的所有编译器选项。在 Release 版本中,除了 /arch:SSE2 /Ot /O2 之外,还使用了许多其他选项。注意 /arch:SSE2 是默认设置,因此您不必明确指定它。另外,/O2 隐含了 /Ot。所以 /arch:SSE2 /Ot /O2 等同于 /O2。

关于c++ - 为什么 godbolt 生成的 asm 输出与我在 Visual Studio 中的实际 asm 代码不同?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54035228/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com