gpt4 book ai didi

c - 如何将 3x3 的卷积核与图像相乘

转载 作者:行者123 更新时间:2023-11-30 21:40:40 27 4
gpt4 key购买 nike

有一个 3x3 的卷积核和一个由整数值像素数组表示的图像。

卷积核表示如下:

// Compound convolution kernels
//                                | 1, 0,  1|
// convolution kernel H = src x   | 0, 0,  0|
//                                |-1, 0, -1|

//                                | 1, 0, -1|
// convolution kernel V = src x   | 0, 0,  0|
//                                | 1, 0, -1|

卷积核 = 核 H + 核 V

// Fragment from the question: walks the image row by row and, per row, in
// 16-byte steps, widening 8-bit pixels to 16-bit lanes and summing the three
// source rows lane-wise. The actual kernel arithmetic is left open by the
// author (see the "How ???" markers).
// NOTE(review): none of the variables used here (the __m128i registers, the
// src*/dst pointers, width, height, res) are declared in the visible fragment.
// The outer loop runs height-2 times.
for(int inc=0; inc<height-2; inc++)
{
// Unaligned 16-byte loads, one from each of the three row pointers.
str1_16pxs = _mm_loadu_si128((__m128i*)(src_all_str));
str2_16pxs = _mm_loadu_si128((__m128i*)(src2_all_str));
str3_16pxs = _mm_loadu_si128((__m128i*)(src3_all_str));

// Zero-extend the low 8 unsigned bytes of each register to eight 16-bit lanes
// (_mm_cvtepu8_epi16 is an SSE4.1 intrinsic).
str1_16pxs_pack1st_8to16 = _mm_cvtepu8_epi16(str1_16pxs);
str2_16pxs_pack1st_8to16 = _mm_cvtepu8_epi16(str2_16pxs);
str3_16pxs_pack1st_8to16 = _mm_cvtepu8_epi16(str3_16pxs);

//---!
// Open question from the author: the convolution for the first 8 pixels
// belongs here.
//... How ???
//---

// Lane-wise 16-bit sum of the three rows for the first 8 pixels
// (row1 + row2, then + row3).
sum1_str12_vert_16pxs_pack1st_8to16 = _mm_add_epi16(str1_16pxs_pack1st_8to16, str2_16pxs_pack1st_8to16);
sum1_str123_vert_16pxs_pack1st_8to16 = _mm_add_epi16(sum1_str12_vert_16pxs_pack1st_8to16,str3_16pxs_pack1st_8to16);

// Inner loop: runs width/16 times (width >> 4).
for(int jnc=0; jnc<(width >> 4); jnc++)
{
// Byte-shift each 16-byte register right by 8 so its upper 8 bytes land in
// the low half.
// NOTE(review): the visible code never reloads str1/2/3_16pxs inside this
// loop; presumably that happens in the elided "//..." part below - confirm.
str1_16pxs_plus_8pxs = _mm_srli_si128(str1_16pxs, 8);
str2_16pxs_plus_8pxs = _mm_srli_si128(str2_16pxs, 8);
str3_16pxs_plus_8pxs = _mm_srli_si128(str3_16pxs, 8);

// Zero-extend those second 8 bytes to 16-bit lanes.
str1_16pxs_pack2nd_8to16 = _mm_cvtepu8_epi16(str1_16pxs_plus_8pxs);
str2_16pxs_pack2nd_8to16 = _mm_cvtepu8_epi16(str2_16pxs_plus_8pxs);
str3_16pxs_pack2nd_8to16 = _mm_cvtepu8_epi16(str3_16pxs_plus_8pxs);

//---!
// Open question from the author: the convolution for the remaining 8 pixels,
// and so on until the end of the line that was read, belongs here.
//... How ???
//---

// Lane-wise 16-bit sum of the three rows for the second 8 pixels.
sum1_str12_vert_16pxs_pack2nd_8to16 = _mm_add_epi16(str1_16pxs_pack2nd_8to16, str2_16pxs_pack2nd_8to16);
sum1_str123_vert_16pxs_pack2nd_8to16 = _mm_add_epi16(sum1_str12_vert_16pxs_pack2nd_8to16,str3_16pxs_pack2nd_8to16);

// Advance all three row pointers by 16 for the next 16 pixels.
// NOTE(review): assumes these are byte pointers, so +16 means 16 pixels - confirm.
src_all_str += 16;
src2_all_str += 16;
src3_all_str += 16;

// (code elided by the author)
//...

// Aligned 16-byte store of res, then advance dst by 8 elements.
// NOTE(review): res is not assigned anywhere in the visible fragment, and
// _mm_store_si128 requires a 16-byte-aligned destination address.
_mm_store_si128((__m128i*)(dst_all_str), res);
dst_all_str += 8;

}//for(jnc)

}//for(inc)

最佳答案

所以,示例代码:

// Skeleton of an SSE 3x3 convolution over an 8-bit image, writing 16-bit
// results. The arithmetic itself is omitted by the author ("blablabla").
//
// src    - source pixels; rows are addressed `width` bytes apart
// width  - row length in pixels; the function returns without doing anything
//          unless it is a multiple of 16
// height - number of rows; the outer loop runs height-2 times
// dst    - output buffer of 16-bit values; advanced 16 elements per 16-pixel block
//
// Only rows i and i+2 are read; the middle row is never loaded, presumably
// because the combined kernel H + V has an all-zero middle row - confirm.
// NOTE(review): `res` is never declared in the visible code, so this does not
// compile as shown. The aligned load/store intrinsics used here also require
// 16-byte-aligned addresses for src and dst.
void SSEcode_Conv3x3 (unsigned char *src, int width, int height, short *dst) 
{
// Reject widths that are not a multiple of 16 (silent early return).
if (width & 0xF) return;

// First and third row of the current 3-row window.
unsigned char* src_line1 = src;
unsigned char* src_line3 = src + 2 * width;

// All-zero register; not referenced by any of the visible code.
__m128i zero = _mm_setzero_si128();

for (int i = 0; i < height - 2; i++)
{
// Aligned loads of the first 16 pixels of the top and bottom rows.
__m128i line1 = _mm_load_si128((__m128i*)src_line1);
__m128i line3 = _mm_load_si128((__m128i*)src_line3);
// Every 16-pixel block except the last one in the row.
for (int j = 0; j < width / 16 - 1; j++)
{
src_line1 += 16;
src_line3 += 16;

// Load the following 16 pixels as well, so both the current and the next
// block are available in registers.
__m128i line1next = _mm_load_si128((__m128i*)src_line1);
__m128i line3next = _mm_load_si128((__m128i*)src_line3);

// (placeholder: convolution arithmetic omitted by the author)
// NOTE(review): the guarded lines below are bare intrinsic names, not
// statements; they would not compile if USE_CORE_H were defined.
#ifdef USE_CORE_H
_mm_add_epi16
_mm_add_epi16
_mm_sub_epi16
#endif
// (placeholder: more omitted code)

// Store 8 results into the second half of the current 16-element output
// block, then slide the "next" registers into the current ones.
// NOTE(review): presumably the first half (dst + 0) is stored in the omitted code.
_mm_store_si128((__m128i*)(dst + 8), res);
line1 = line1next;
line3 = line3next;

dst += 16;
}//for (j)

// Step past the last block. Each pointer has now advanced by exactly `width`
// in this row, so it points at the start of the next row.
src_line1 += 16;
src_line3 += 16;

// (placeholder: handling of the last 16-pixel block omitted by the author)

_mm_store_si128((__m128i*)(dst + 8), res);
dst += 16;
}//for (i)

}

写这段代码花了很长时间。我是新手,很遗憾精通 SSE 的人没有帮忙解决这个问题的本质。:(

关于c - 如何将 3x3 的卷积核与图像相乘,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/50413020/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com