gpt4 book ai didi

c++ - 如何将此代码重写为 sse 内在函数

转载 作者:太空宇宙 更新时间:2023-11-04 03:42:37 27 4
gpt4 key购买 nike

我是 SSE 内在函数(intrinsics)的新手,希望在使用它们时能得到一些提示和帮助(因为这对我来说还很模糊)。

我得到了这样的代码

// Scalar reference loop, manually unrolled by four: for each k it derives a
// source coordinate (xc, yc), reads one value from working_buffer and writes
// it to frame_bitmap at base_adr + k.
// The condition k <= n-4 means only complete groups of four are processed;
// any remaining n % 4 elements are not handled by this loop.
for(int k=0; k<=n-4; k+=4) 
{

// Coordinates for element k: linear in k, shifted right by 6 and offset by 512.
// NOTE(review): idx/idy and iddx/iddy are presumably fixed-point start and
// per-step values with 6 fractional bits -- confirm against the caller.
int xc0 = 512 + ((idx + k*iddx)>>6);
int yc0 = 512 + ((idy + k*iddy)>>6);

// Same formula for element k+1.
int xc1 = 512 + ((idx + (k+1)*iddx)>>6);
int yc1 = 512 + ((idy + (k+1)*iddy)>>6);

// Same formula for element k+2.
int xc2 = 512 + ((idx + (k+2)*iddx)>>6);
int yc2 = 512 + ((idy + (k+2)*iddy)>>6);

// Same formula for element k+3.
int xc3 = 512 + ((idx + (k+3)*iddx)>>6);
int yc3 = 512 + ((idy + (k+3)*iddy)>>6);

// Four independent reads at index yc * working_buffer_size_x + xc.
// The indices are data-dependent, so the reads are generally not contiguous.
unsigned color0 = working_buffer[yc0*working_buffer_size_x + xc0];
unsigned color1 = working_buffer[yc1*working_buffer_size_x + xc1];
unsigned color2 = working_buffer[yc2*working_buffer_size_x + xc2];
unsigned color3 = working_buffer[yc3*working_buffer_size_x + xc3];

// Destination of the first of the four elements.
int adr = base_adr + k;

// Four writes to consecutive frame_bitmap entries adr .. adr+3.
frame_bitmap[adr] = color0;
frame_bitmap[adr+1]= color1;
frame_bitmap[adr+2]= color2;
frame_bitmap[adr+3]= color3;
}

这里全部是 int/unsigned 类型,这段代码位于循环的关键部分。我不确定整数 SSE 在这里是否有助于提高速度,但想知道它是否可行?有人可以帮忙吗?

(我使用的是 mingw32)

最佳答案

我的 sse 有点生疏,但你应该做的是:

xmm0: [k, k+1, k+2, k+3] //xc0, xc1,....
xmm1: [k, k+1, k+2, k+3] //yc0, yc1,....
//initialize before the loop
xmm2: [512, 512, 512, 512]
xmm3: [idx, idx, idx, idx]
xmm4: [iddx, iddx, iddx, iddx]
xmm5: [idy, idy, idy, idy]
xmm6: [iddy, iddy, iddy, iddy]
xmm7: [working_buffer_size_x, working_buffer_size_x, working_buffer_size_x, working_buffer_size_x]

计算:

xmm0 * xmm4
xmm0 + xmm3
xmm0 >> 6
xmm0 + xmm2

xmm0: [xc0, xc1, xc2, xc3]
///////////////////////////////

xmm1 * xmm6
xmm1 + xmm5
xmm1 >> 6
xmm1 + xmm2

xmm1: [yc0, yc1, yc2, yc3]

xmm1 * xmm7
xmm1 + xmm0

现在 xmm1 是:

xmm1: [yc0*working_buffer_size_x + xc0, yc1*working_buffer_size_x + xc1, yc2*working_buffer_size_x + xc2, yc3*working_buffer_size_x + xc3]

您在每次循环迭代中都要读写内存(working_buffer、frame_bitmap 数组),这些操作比计算本身慢得多,因此速度提升不会像您预期的那样多。

编辑

您需要对齐 working_buffer 和 frame_bitmap 数组,并且需要 SSE4.1 支持:

// SSE version of the 4x-unrolled lookup loop: the four working_buffer indices
// for k .. k+3 are computed in one 128-bit vector (four 32-bit lanes), spilled
// to the array `a`, and then used for scalar reads and writes.
#include <emmintrin.h>
#include <smmintrin.h> //SSE4.1

// Scratch array that receives the four computed indices. It is declared
// 16-byte aligned because _mm_store_si128 below is the aligned store.
int a[4] __attribute__((aligned(16)));
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

// Loop-invariant values, each broadcast to all four lanes once before the loop.
xmm2 = _mm_set1_epi32(512);
xmm3 = _mm_set1_epi32(idx);
xmm4 = _mm_set1_epi32(iddx);
xmm5 = _mm_set1_epi32(idy);
xmm6 = _mm_set1_epi32(iddy);
xmm7 = _mm_set1_epi32(working_buffer_size_x);

// Same bounds as the scalar loop: only complete groups of four are processed.
for(k = 0; k <= n - 4; k +=4){
// _mm_set_epi32 takes its arguments from the highest lane to the lowest, so
// lane 0 holds k and lane 3 holds k + 3. xmm0 (x path) and xmm1 (y path)
// start with the same lane values.
xmm0 = _mm_set_epi32(k + 3, k + 2, k + 1, k);
xmm1 = _mm_set_epi32(k + 3, k + 2, k + 1, k);

// x path: xc = 512 + ((idx + k*iddx) >> 6), per lane.
// _mm_mullo_epi32 (low 32 bits of each product) is the SSE4.1 intrinsic here.
//xmm0 * xmm4
xmm0 = _mm_mullo_epi32(xmm0, xmm4);

//xmm0 + xmm3
xmm0 = _mm_add_epi32(xmm0, xmm3);

// _mm_srai_epi32 is an arithmetic (sign-propagating) right shift.
// NOTE(review): this matches `>>` on a negative int only where the compiler
// implements that as an arithmetic shift (GCC does) -- confirm for other targets.
//xmm0 >> 6
xmm0 = _mm_srai_epi32(xmm0, 6);

//xmm0 + xmm2
xmm0 = _mm_add_epi32(xmm0, xmm2);



// y path: yc = 512 + ((idy + k*iddy) >> 6), per lane.
//xmm1 * xmm6
xmm1 = _mm_mullo_epi32(xmm1, xmm6);

//xmm1 + xmm5
xmm1 = _mm_add_epi32(xmm1, xmm5);

//xmm1 >> 6
xmm1 = _mm_srai_epi32(xmm1, 6);

//xmm1 + xmm2
xmm1 = _mm_add_epi32(xmm1, xmm2);


// Combine into the linear index yc * working_buffer_size_x + xc, per lane.
//xmm1 * xmm7
xmm1 = _mm_mullo_epi32(xmm1, xmm7);
//xmm1 + xmm0
xmm1 = _mm_add_epi32(xmm1, xmm0);


// Spill the four indices to memory so they can be used as scalar subscripts.
//a[0] = yc0*working_buffer_size_x + xc0
//a[1] = yc1*working_buffer_size_x + xc1
//a[2] = yc2*working_buffer_size_x + xc2
//a[3] = yc3*working_buffer_size_x + xc3
_mm_store_si128((__m128i *)&a[0], xmm1);

// The lookups themselves stay scalar: four independent, data-dependent reads.
unsigned color0 = working_buffer[ a[0] ];
unsigned color1 = working_buffer[ a[1] ];
unsigned color2 = working_buffer[ a[2] ];
unsigned color3 = working_buffer[ a[3] ];

// Destination of the first of the four elements.
int adr = base_adr + k;

// Four scalar writes to consecutive frame_bitmap entries adr .. adr+3.
frame_bitmap[adr] = color0;
frame_bitmap[adr+1]= color1;
frame_bitmap[adr+2]= color2;
frame_bitmap[adr+3]= color3;
}

您可以改用汇编直接操作内存,从而省去 _mm_store_si128((__m128i *)&a[0], xmm1); 和 int adr = base_adr + k; 这两条语句,进一步优化它。

关于c++ - 如何将此代码重写为 sse 内在函数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27333471/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com