gpt4 book ai didi

c++ - 使用 SIMD 将 10 位值打包成字节流

转载 作者:塔克拉玛干 更新时间:2023-11-03 00:15:03 24 4
gpt4 key购买 nike

<分区>

我正在尝试使用 SIMD 指令将 10 位像素打包成连续的字节流。下面的代码“原则上”执行此操作,但 SIMD 版本比标量版本慢。

问题似乎是我找不到可以有效加载寄存器的良好收集/分散操作。

有什么改进建议吗?

// SIMD_test.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"

#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>

// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration

void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
for(uint32_t j=0;j<NCOL;j+=4)
{
streamBuffer[0] = (uint8_t)(ptr[0]);
streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2));
streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4));
streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6));
streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2) ;
streamBuffer += 5;
ptr += 4;
}
}


// poorly written SIMD implementation. Attempts to do the same
// as the packSlow, but 8 iterations at a time

void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);

for(uint32_t j=0;j<NCOL;j+=4*8)
{
_mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
}

for(uint32_t j=0;j<NCOL;j+=4*8)
{
// this "fetch" stage is costly. Each term takes 2 cycles
__m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
__m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
__m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
__m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);

// I think this part is fairly well optimized
__m128i streamBuffer0 = ptr0;
__m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2)));
__m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4)));
__m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6)));
__m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ;

// this again is terribly slow. ~2 cycles per byte output
for(int j=15;j>=0;j-=2)
{
streamBuffer[0] = streamBuffer0.m128i_u8[j];
streamBuffer[1] = streamBuffer1.m128i_u8[j];
streamBuffer[2] = streamBuffer2.m128i_u8[j];
streamBuffer[3] = streamBuffer3.m128i_u8[j];
streamBuffer[4] = streamBuffer4.m128i_u8[j];
streamBuffer += 5;
}
ptr += 32;
}

}

int _tmain(int argc, _TCHAR* argv[])
{

uint16_t pixels[512];
uint8_t packed1[512*10/8];
uint8_t packed2[512*10/8];

for(int i=0;i<512;i++)
{
pixels[i] = i;
}

LARGE_INTEGER t0,t1,t2;

QueryPerformanceCounter(&t0);
for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
QueryPerformanceCounter(&t1);
for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
QueryPerformanceCounter(&t2);

printf("%d %d\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart);

if (memcmp(packed1,packed2,sizeof(packed1)))
{
printf("failed\n");
}


return 0;
}

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com