gpt4 book ai didi

c++ - SSE Intrinsics 和循环展开

转载 作者:塔克拉玛干 更新时间:2023-11-03 01:35:14 29 4
gpt4 key购买 nike

我正在尝试优化一些循环并且我已经成功了,但我想知道我是否只做了部分正确的事情。比如说我有这个循环:

// Scalar baseline: for every index below n, stores a[i]*2 into b[i].
for(i=0;i<n;i++){
b[i] = a[i]*2;
}

将它按因子 4 展开(每次迭代处理 4 个元素),得到:

// For non-negative n, unroll is the largest multiple of 4 that does not exceed n.
int unroll = (n/4)*4;
// Main loop: four independent element updates per iteration.
for(i=0;i<unroll;i+=4)
{
b[i] = a[i]*2;
b[i+1] = a[i+1]*2;
b[i+2] = a[i+2]*2;
b[i+3] = a[i+3]*2;
}
// Remainder loop: i carries over from the main loop, so this handles the
// last n - unroll elements one at a time.
for(;i<n;i++)
{
b[i] = a[i]*2;
}

那么,与之等价的 SSE 写法是下面这样吗:

// Candidate 1: one unaligned 4-float load starting at a[i], a multiply by a
// vector with 2 in every lane, and one unaligned 4-float store starting at b[i].
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);

或者是:

// Candidate 2 (as asked in the question): repeats the 4-float load/multiply/store
// at offsets i, i+1, i+2 and i+3. Each load already covers four consecutive
// floats, so the four groups touch overlapping ranges a[i..i+3] through a[i+3..i+6].
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);

__m128 ai1_v = _mm_loadu_ps(&a[i+1]);
__m128 two1_v = _mm_set1_ps(2);
__m128 ai_1_2_v = _mm_mul_ps(ai1_v, two1_v);
_mm_storeu_ps(&b[i+1], ai_1_2_v);

// NOTE(review): ai2_v is also declared in the first group above; within a
// single scope this second declaration is a redefinition.
__m128 ai2_v = _mm_loadu_ps(&a[i+2]);
__m128 two2_v = _mm_set1_ps(2);
__m128 ai_2_2_v = _mm_mul_ps(ai2_v, two2_v);
_mm_storeu_ps(&b[i+2], ai_2_2_v);

__m128 ai3_v = _mm_loadu_ps(&a[i+3]);
__m128 two3_v = _mm_set1_ps(2);
__m128 ai_3_2_v = _mm_mul_ps(ai3_v, two3_v);
_mm_storeu_ps(&b[i+3], ai_3_2_v);

我对代码部分有点困惑:

// No initializer: the loop resumes from the current value of i and processes
// every remaining index below n one element at a time.
for(;i<n;i++)
{
b[i] = a[i]*2;
}

这段代码是做什么的?它是否只是用来处理剩余的元素,例如当循环次数不能被你选择的展开因子整除时?谢谢。

最佳答案

答案是第一个代码块:

    // Loads four floats starting at a[i], multiplies each lane by 2, and
    // stores the four results starting at b[i].
    __m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v,two_v);
_mm_storeu_ps(&b[i],ai2_v);

它已经一次处理四个元素(一个 __m128 中包含四个 float)。

下面是完整的程序,其中的等效代码部分已被注释掉:

#include <iostream>
#include <xmmintrin.h>  // SSE intrinsics: __m128, _mm_loadu_ps, _mm_set1_ps, _mm_mul_ps, _mm_storeu_ps

/// Doubles the ten elements of `a` into `b` using 4-wide SSE for the bulk of
/// the array and a scalar loop for the leftover elements, then prints both
/// arrays as tab-separated rows.
int main()
{
    constexpr int n = 10;
    float a[n] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    float b[n] = {};

    // Largest multiple of 4 that does not exceed n: the part the SSE loop covers.
    constexpr int unroll = (n / 4) * 4;

    // The multiplier is loop-invariant, so it is built once outside the loop.
    const __m128 two_v = _mm_set1_ps(2.0f);

    int i = 0;
    for (; i < unroll; i += 4) {
        // Scalar equivalent of one iteration:
        //b[i] = a[i]*2;
        //b[i+1] = a[i+1]*2;
        //b[i+2] = a[i+2]*2;
        //b[i+3] = a[i+3]*2;
        const __m128 ai_v = _mm_loadu_ps(&a[i]);
        const __m128 ai2_v = _mm_mul_ps(ai_v, two_v);
        _mm_storeu_ps(&b[i], ai2_v);
    }

    // Remainder: the n - unroll elements that do not fill a whole 4-float vector.
    for (; i < n; ++i) {
        b[i] = a[i] * 2;
    }

    for (const float value : a) { std::cout << value << "\t"; }
    std::cout << "\n";
    for (const float value : b) { std::cout << value << "\t"; }
    std::cout << "\n";

    return 0;
}

至于效率:在我的系统上,生成的汇编代码使用的是 movups 指令,而手写的代码可以改为使用 movaps,这应该更快。

我用下面的程序做了一些基准测试:

#include <cstddef>
#include <iostream>
#include <xmmintrin.h>  // SSE intrinsics: __m128, _mm_load(u)_ps, _mm_store(u)_ps, _mm_set1_ps, _mm_mul_ps

// Benchmark variant selection: enable exactly one of the four macros below.
//#define NO_UNROLL
//#define UNROLL
//#define SSE_UNROLL
#define SSE_UNROLL_ALIGNED

/// Runs the selected "b[i] = a[i]*2" kernel 100000 times over a
/// 100003-element array. The program does not time itself and prints nothing;
/// the run time is meant to be measured externally.
int main()
{
    const std::size_t array_size = 100003;
    int i{0};
#ifdef SSE_UNROLL_ALIGNED
    // _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses. Standard
    // alignas(16) replaces the MSVC-only __declspec(align(16)).
    alignas(16) float a[array_size] = {1,2,3,4,5,6,7,8,9,10};
    alignas(16) float b[array_size] = {0,0,0,0,0,0,0,0,0,0};
#else
    float a[array_size] = {1,2,3,4,5,6,7,8,9,10};
    float b[array_size] = {0,0,0,0,0,0,0,0,0,0};
#endif

    int n = static_cast<int>(array_size);
    // Largest multiple of 4 that does not exceed n; the unrolled/SSE loops stop here.
    int unroll = (n/4)*4;

    // NOTE(review): b is never read after these loops (the prints below are
    // commented out), so an optimizer is permitted to discard the work.
    for (std::size_t j{0}; j < 100000; ++j) {
#ifdef NO_UNROLL
        for (i=0; i<n; i++) {
            b[i] = a[i]*2;
        }
#endif
#ifdef UNROLL
        for (i=0; i<unroll; i+=4) {
            b[i] = a[i]*2;
            b[i+1] = a[i+1]*2;
            b[i+2] = a[i+2]*2;
            b[i+3] = a[i+3]*2;
        }
#endif
#ifdef SSE_UNROLL
        for (i=0; i<unroll; i+=4) {
            __m128 ai_v = _mm_loadu_ps(&a[i]);
            __m128 two_v = _mm_set1_ps(2);
            __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
            _mm_storeu_ps(&b[i],ai2_v);
        }
#endif
#ifdef SSE_UNROLL_ALIGNED
        // i is always a multiple of 4 here, so &a[i] and &b[i] stay 16-byte
        // aligned as long as the arrays themselves are.
        for (i=0; i<unroll; i+=4) {
            __m128 ai_v = _mm_load_ps(&a[i]);
            __m128 two_v = _mm_set1_ps(2);
            __m128 ai2_v = _mm_mul_ps(ai_v,two_v);
            _mm_store_ps(&b[i],ai2_v);
        }
#endif
#ifndef NO_UNROLL
        // Remainder for the three unrolled variants: elements unroll..n-1.
        for (; i<n; i++) {
            b[i] = a[i]*2;
        }
#endif
    }

    //for (auto i : a) { std::cout << i << "\t"; }
    //std::cout << "\n";
    //for (auto i : b) { std::cout << i << "\t"; }
    //std::cout << "\n";

    return 0;
}

我得到了以下结果 (x86):

  • NO_UNROLL:0.994 秒,编译器未选择 SSE
  • UNROLL:3.511 秒,使用 movups
  • SSE_UNROLL:3.315 秒,使用 movups
  • SSE_UNROLL_ALIGNED:3.276 秒,使用 movaps

很明显,展开循环在这种情况下没有帮助。即使确保我们使用更高效的 movaps 也无济于事。

但是在编译为 64 位 (x64) 时我得到了一个更奇怪的结果:

  • NO_UNROLL:1.138 秒,编译器未选择 SSE
  • UNROLL:1.409 秒,编译器未选择 SSE
  • SSE_UNROLL:1.420 秒,编译器仍然没有选择 SSE!
  • SSE_UNROLL_ALIGNED:1.476 秒,编译器仍然没有选择 SSE!

似乎 MSVC 看穿了我们手动优化的意图,并生成了更好的汇编代码,不过仍然比完全不做任何手动优化时要慢。

关于c++ - SSE Intrinsics 和循环展开,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/35921512/

29 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com