gpt4 book ai didi

c++ - 加速一个短到 float 的转换?

转载 作者:可可西里 更新时间:2023-11-01 18:41:21 26 4
gpt4 key购买 nike

我在 C++ 中有一个 short to float cast,它是我的代码的瓶颈。

代码从硬件设备缓冲区转换而来,该缓冲区本身是短路的,这代表来自奇特的光子计数器的输入。

float factor=  1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}

一些细节

  1. 取值范围为 0 到 2^16-1,表示高灵敏度相机的像素值

  2. 我在一台配备 i7 处理器(i7 960,即 SSE 4.2 和 4.1)的多核 x86 机器上。

  3. 源与 8 位边界对齐(硬件设备的要求)

  4. W*H总是能被8整除,大部分时候W和H都能被8整除

这让我很难过,我能做些什么吗?

我正在使用 Visual Studios 2012...

最佳答案

这是一个基本的 SSE4.1 实现:

__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
// Load 8 16-bit ushorts.
// vi = {a,b,c,d,e,f,g,h}
__m128i vi = _mm_load_si128((const __m128i*)(source + i));

// Convert to 32-bit integers
// vi0 = {a,0,b,0,c,0,d,0}
// vi1 = {e,0,f,0,g,0,h,0}
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);

// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);

// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}

假设:

  1. sourcedestination 都对齐到 16 字节。
  2. W*H 是 8 的倍数。

进一步展开此循环可能会做得更好。 (见下文)


这里的思路是这样的:

  1. 将 8 条短裤加载到单个 SSE 寄存器中。
  2. 将寄存器分成两部分:一个包含底部的 4 个短裤,另一个包含顶部的 4 个短裤。
  3. 将两个寄存器零扩展为 32 位整数。
  4. 将它们都转换为float
  5. 乘以系数。
  6. 将它们存储到目的地

编辑:

我已经有一段时间没有进行此类优化了,所以我继续展开循环。

酷睿 i7 920 @ 3.5 GHz
Visual Studio 2012 - 发布 x64:

Original Loop      : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416

进一步展开导致 yield 递减。

测试代码如下:

#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;


void default_loop(float *destination,const short* source,float value,int size){
float factor = 1.0f / value;
for (int i = 0; i < size; i++)
{
int value = source[i];
destination[i] = value*factor;
}
}
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));

// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));

// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);

// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);

// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));

// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);

// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);

// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);

// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);

// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void print_sum(const float *destination,int size){
float sum = 0;
for (int i = 0; i < size; i++){
sum += destination[i];
}
cout << sum << endl;
}

int main(){

int size = 8000;

short *source = (short*)_mm_malloc(size * sizeof(short), 16);
float *destination = (float*)_mm_malloc(size * sizeof(float), 16);

for (int i = 0; i < size; i++){
source[i] = i;
}

float value = 1.1;

int iterations = 1000000;
clock_t start;

// Default Loop
start = clock();
for (int it = 0; it < iterations; it++){
default_loop(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);

// Vectorize 8, no unroll
start = clock();
for (int it = 0; it < iterations; it++){
vectorize8_unroll1(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);

// Vectorize 8, unroll 2
start = clock();
for (int it = 0; it < iterations; it++){
vectorize8_unroll2(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);

_mm_free(source);
_mm_free(destination);

system("pause");
}

关于c++ - 加速一个短到 float 的转换?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/16031149/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com