gpt4 book ai didi

c - 使用 SSE 计算点积

转载 作者:行者123 更新时间:2023-12-03 15:54:24 31 4
gpt4 key购买 nike

// Number of elements allocated for each buffer below. Only the first 16
// elements are ever initialized or read in this snippet.
#define Size 50000

// Demo from the question: multiplies two 16-element unsigned char arrays
// element-wise using float SSE arithmetic, reduces the products with
// _mm_hadd_ps (SSE3), and prints one intermediate vector byte by byte.
// NOTE(review): `void main()` is non-standard; standard C/C++ requires
// `int main`.
// NOTE(review): none of the three malloc'ed buffers is freed, and `result`
// is never used in this snippet.
void main()
{

unsigned char *arry1 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned char *arry2 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned int *result = (unsigned int*)malloc(sizeof(unsigned int)* Size);


// Fill the first 16 elements of both inputs with 0..15, so every product
// below is i*i.
for (int i = 0; i < 16; i++)
{
arry1[i] = i;
arry2[i] = i;
}


// Z is the all-zero operand used by the unpack intrinsics to zero-extend.
__m128i Z = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
//__m128i dummy = _mm_setzero_si128();

// Runs exactly once (j == 0): a single 16-byte chunk is processed.
for (int j = 0; j < 16; j += 16)
{
//printf("%d\n\n", j);

// Unaligned 16-byte loads of the two inputs.
__m128i test1 = _mm_setzero_si128();
test1 = _mm_loadu_si128((__m128i*)&arry1[j]);
__m128i test2 = _mm_setzero_si128();
test2 = _mm_loadu_si128((__m128i*)&arry2[j]);

// Zero-extend the 16 bytes of test1: 8-bit -> 16-bit -> 32-bit, giving four
// vectors of four 32-bit integers each.
__m128i s16L = _mm_unpacklo_epi8(test1, Z);
__m128i s16H = _mm_unpackhi_epi8(test1, Z);
__m128i s32LL = _mm_unpacklo_epi16(s16L, Z);
__m128i s32LH = _mm_unpackhi_epi16(s16L, Z);
__m128i s32HL = _mm_unpacklo_epi16(s16H, Z);
__m128i s32HH = _mm_unpackhi_epi16(s16H, Z);

// Same widening for test2.
__m128i t16L = _mm_unpacklo_epi8(test2, Z);
__m128i t16H = _mm_unpackhi_epi8(test2, Z);
__m128i t32LL = _mm_unpacklo_epi16(t16L, Z);
__m128i t32LH = _mm_unpackhi_epi16(t16L, Z);
__m128i t32HL = _mm_unpacklo_epi16(t16H, Z);
__m128i t32HH = _mm_unpackhi_epi16(t16H, Z);

// Convert the 32-bit integers to single-precision floats.
__m128 s1 = _mm_cvtepi32_ps(s32LL);
__m128 s2 = _mm_cvtepi32_ps(s32LH);
__m128 s3 = _mm_cvtepi32_ps(s32HL);
__m128 s4 = _mm_cvtepi32_ps(s32HH);

__m128 t1 = _mm_cvtepi32_ps(t32LL);
__m128 t2 = _mm_cvtepi32_ps(t32LH);
__m128 t3 = _mm_cvtepi32_ps(t32HL);
__m128 t4 = _mm_cvtepi32_ps(t32HH);

// Element-wise products; with the inputs above these are the squares 0..225.
s1 = _mm_mul_ps(s1, t1);
s2 = _mm_mul_ps(s2, t2);
s3 = _mm_mul_ps(s3, t3);
s4 = _mm_mul_ps(s4, t4);

// Pairwise horizontal adds (lanes listed from lowest to highest).
s1 = _mm_hadd_ps(s1, s2);// {1, 13, 41, 85}
s3 = _mm_hadd_ps(s3, s4); // {145, 221, 313, 421}

// Convert the four float sums in s3 back to four 32-bit integers.
vsum = _mm_cvtps_epi32(s3);

// Prints the 16 individual BYTES of vsum, not its four 32-bit lanes, so a
// lane value above 255 shows up split across bytes (313 = 0x0139 prints as
// 57 and 1; 421 = 0x01A5 prints as 165 and 1). This is the "wrong value"
// the question asks about.
// NOTE(review): m128i_i8 looks like the MSVC-specific __m128i union member;
// confirm before building with another compiler.
for (int k = 0; k < 16; k++)
{
printf("%u\n", (unsigned char)vsum.m128i_i8[k]);
}

// Finish the reduction; after the last hadd every lane of s1 holds the full
// dot product (1240 for these inputs). The value is not stored or printed.
s1 = _mm_hadd_ps(s1, s3); // {14, 126, 366, 734}
s1 = _mm_hadd_ps(s1, s1); // {140, 1100, 140, 1100}
s1 = _mm_hadd_ps(s1, s1); // {1240, 1240, 1240, 1240}


}


}

我使用 SSE 计算点积。我使用的是 _mm_mul_ps 和 _mm_hadd_ps 指令,而不是 _mm_dp_ps。如果 _mm_hadd_ps 之后的值超过 255,就会显示错误的值。

例如,s3 的正确值应为 {0,0,0,421,0,0,0,313,0,0,0,221,0,0,0,145},但实际打印的是 {0,0,1,165,0,0,1,57,0,0,0,221,0,0,0,145}。这是因为我把 arry1、arry2 声明为 unsigned char 造成的吗?我知道 255 是 8 位能表示的最大值。

最佳答案

我可以在这里看到一些问题:

1) 如果只计算 50000 个 uint8_t 值的点积,用 uint32_t 保存结果是可以的;但 70000 个值的点积就可能使 uint32_t 溢出。因此,使用 uint64_t 是更好的解决方案。

2) 计算整数 vector 的点积不必使用 float,只使用整数运算效率更高。

下面是一个 SSE2 函数的示例,它计算两个 uint8_t vector 的点积:

#include <algorithm>
#include <emmintrin.h>

// All-zero vector. Used as the second operand of the unpack intrinsics to
// zero-extend lanes, and as the initial value of the accumulators.
const __m128i Z = _mm_setzero_si128();
// Size of one __m128i in bytes; the stride of the vectorized loop in DotProduct.
const size_t A = sizeof(__m128i);
// Maximum number of bytes accumulated in 32-bit lanes before DotProduct
// flushes them into the 64-bit accumulator. Assuming A == 16, one step adds at
// most 4 * 255 * 255 = 260100 to a lane, and 0x10000 / 16 = 4096 steps give at
// most 1,065,369,600, which still fits in 32 bits.
const size_t B = 0x10000;

/// Multiplies 16 consecutive bytes of `a` with the matching 16 bytes of `b`
/// and returns four 32-bit partial sums (each lane holds the sum of four of
/// the sixteen products). Both pointers may be unaligned; each must address
/// at least 16 readable bytes.
inline __m128i DotProduct32(const uint8_t * a, const uint8_t * b)
{
    const __m128i bytesA = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a));
    const __m128i bytesB = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b));

    // Interleaving with zero widens every byte to an unsigned 16-bit lane.
    const __m128i lowA = _mm_unpacklo_epi8(bytesA, Z);
    const __m128i lowB = _mm_unpacklo_epi8(bytesB, Z);
    const __m128i highA = _mm_unpackhi_epi8(bytesA, Z);
    const __m128i highB = _mm_unpackhi_epi8(bytesB, Z);

    // madd multiplies 16-bit lanes and adds adjacent products into 32-bit lanes.
    const __m128i lowSums = _mm_madd_epi16(lowA, lowB);
    const __m128i highSums = _mm_madd_epi16(highA, highB);
    return _mm_add_epi32(lowSums, highSums);
}

/// Widens the four 32-bit lanes of `a` to 64 bits (zero-extended, i.e. treated
/// as unsigned) and adds them pairwise: the result's low 64-bit lane is
/// lane0 + lane2 and its high 64-bit lane is lane1 + lane3.
inline __m128i HorizontalSum32(__m128i a)
{
    // Interleaving with zero turns each 32-bit lane into a 64-bit lane.
    const __m128i lanes01 = _mm_unpacklo_epi32(a, Z);
    const __m128i lanes23 = _mm_unpackhi_epi32(a, Z);
    return _mm_add_epi64(lanes01, lanes23);
}

/// Returns the sum of the two 64-bit lanes of `a` (wrapping modulo 2^64).
inline uint64_t ExtractSum64(__m128i a)
{
    // Add the high lane onto the low lane, then read back only the low 64 bits.
    const __m128i folded = _mm_add_epi64(a, _mm_unpackhi_epi64(a, a));
    uint64_t total = 0;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(&total), folded);
    return total;
}

/// Computes the dot product of two uint8_t arrays.
///
/// @param a    first input array, at least `size` elements
/// @param b    second input array, at least `size` elements
/// @param size number of elements to process; 0 yields a result of 0
/// @param sum  receives the 64-bit result (overwritten, not accumulated into)
///
/// The bulk of the data is processed A bytes at a time. Partial sums are kept
/// in 32-bit lanes for at most B bytes and then flushed into a 64-bit
/// accumulator so the 32-bit lanes cannot overflow. The remaining
/// `size % A` elements are handled by a scalar loop.
void DotProduct(const uint8_t * a, const uint8_t * b, size_t size, uint64_t * sum)
{
    const size_t alignedSize = size/A*A;
    size_t i = 0;

    __m128i sum64 = Z;
    // One pass per block of up to B bytes. The loop is driven by `i`, which the
    // inner loop advances; the original incremented `i` instead of its block
    // counter here, so the loop never terminated for size > 0.
    while (i < alignedSize)
    {
        __m128i sum32 = Z;
        for (const size_t blockEnd = std::min(alignedSize, i + B); i < blockEnd; i += A)
            sum32 = _mm_add_epi32(sum32, DotProduct32(a + i, b + i));
        sum64 = _mm_add_epi64(sum64, HorizontalSum32(sum32));
    }

    // Scalar tail for the elements that do not fill a whole vector.
    uint64_t total = ExtractSum64(sum64);
    for (; i < size; ++i)
        total += static_cast<uint64_t>(a[i]) * b[i];
    *sum = total;
}

关于c - 使用 SSE 计算点积,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/42537818/

31 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com