gpt4 book ai didi

c++ - 纯 C++ 代码比内联汇编程序快 10 倍。为什么?

转载 作者:IT老高 更新时间:2023-10-28 23:11:50 27 4
gpt4 key购买 nike

这两个代码片段做同样的事情:将两个 float 组相加并将结果存储回其中。

内联汇编器:

void vecAdd_SSE(float* v1, float* v2) { 
_asm {
mov esi, v1
mov edi, v2
movups xmm0, [esi]
movups xmm1, [edi]
addps xmm0, xmm1
movups [esi], xmm0
movups [edi], xmm0
}
}

纯 C++ 代码:

void vecAdd_Std(float* v1, float* v2) {
v1[0] = v1[0]+ v2[0];
v1[1] = v1[1]+ v2[1];
v1[2] = v1[2]+ v2[2];
v1[3] = v1[3]+ v2[3];

v2[0] = v1[0];
v2[1] = v1[1];
v2[2] = v1[2];
v2[3] = v1[3];
}

C++ 代码的反汇编(在 Debug模式下进行的反汇编,因为由于某种原因我无法在 Release模式下查看反汇编):

 void vecAdd_Std(float* v1, float* v2) {
push ebp
mov ebp,esp
sub esp,0C0h
push ebx
push esi
push edi
lea edi,[ebp-0C0h]
mov ecx,30h
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]

v1[0] = v1[0]+ v2[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,0
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v1[1] = v1[1]+ v2[1];
mov eax,4
shl eax,0
v1[1] = v1[1]+ v2[1];
mov ecx,4
shl ecx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,0
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[2] = v1[2]+ v2[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,1
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[3] = v1[3]+ v2[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,3
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0

v2[0] = v1[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
v2[1] = v1[1];
mov eax,4
shl eax,0
mov ecx,4
shl ecx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[2] = v1[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[3] = v1[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx

}

现在我对这些函数进行了时间测量,发现内联汇编代码需要大约 10 倍的时间(在 Release模式下)。有人知道为什么吗?

最佳答案

在我的机器上(VS2015 64位模式),编译器内联vecAdd_Std并产生

00007FF625921C8F  vmovups     xmm1,xmmword ptr [__xmm@4100000040c000004080000040000000 (07FF625929D60h)]  
00007FF625921C97 vmovups xmm4,xmm1
00007FF625921C9B vcvtss2sd xmm1,xmm1,xmm4

测试代码

int main() {
float x[4] = {1.0, 2.0, 3.0, 4.0};
float y[4] = {1.0, 2.0, 3.0, 4.0};

vecAdd_Std(x, y);

std::cout << x[0];
}

关于c++ - 纯 C++ 代码比内联汇编程序快 10 倍。为什么?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/32385696/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com