gpt4 book ai didi

c++ - 使用 ARM NEON 内部函数对 cvtColor 进行 SIMD 优化

转载 作者:可可西里 更新时间:2023-11-01 16:04:45 24 4
gpt4 key购买 nike

我正在研究 BGR 到灰度转换的 SIMD 优化,相当于 OpenCV's cvtColor() function .这个函数有一个 Intel SSE 版本,我指的是它。 (我所做的基本上是将 SSE 代码转换为 NEON 代码。)

我快写完代码了,可以用g++编译了,但是我得不到正确的输出。有谁知道错误可能是什么?

我得到的(不正确的):

my output

我应该得到的:

properly converted output

这是我的代码:

#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>

using namespace std;
//using namespace cv;

#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })

void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);

const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};

const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};

const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};

const int shift = 8;
const int amp = 1<<shift;

const int16_t _R_ = (int16_t)(amp*0.299);
const int16_t _G_ = (int16_t)(amp*0.587);
const int16_t _B_ = (int16_t)(amp*0.114);
const int16x8_t R = vdupq_n_s16(_R_);
const int16x8_t G = vdupq_n_s16(_G_);
const int16x8_t B = vdupq_n_s16(_B_);
const int8x16_t zero = vdupq_n_s8(0);

for(int i = 0; i < size; i += 48)
{
int8x16_t a = vld1q_s8((int8_t *) s + i);
int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
int8x16_t c = vld1q_s8((int8_t *) s + i + 32);

a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));

//BBBBBB
const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);

a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));

//GGGGGG
const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);

a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));

//RRRRRR
const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);

/*
int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
*/

int8x16_t a1 = aaaa;
int8x16_t a2 = zero;
int8x16x2_t temp1 = vzipq_s8(a1, a2);
a1 = temp1.val[0];
a2 = temp1.val[1];
int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);

int8x16_t b1 = bbbb;
int8x16_t b2 = zero;
int8x16x2_t temp2 = vzipq_s8(b1, b2);
b1 = temp2.val[0];
b2 = temp2.val[1];
int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);

int8x16_t c1 = cccc;
int8x16_t c2 = zero;
int8x16x2_t temp3 = vzipq_s8(c1, c2);
c1 = temp3.val[0];
c2 = temp3.val[1];
int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);

aa1 = vaddq_s16(aa1, bb1);
aa1 = vaddq_s16(aa1, cc1);
aa2 = vaddq_s16(aa2, bb2);
aa2 = vaddq_s16(aa2, cc2);

const int shift1 = 8;
aa1 = vshrq_n_s16(aa1, shift1);
aa2 = vshrq_n_s16(aa2, shift1);

uint8x8_t aaa1 = vqmovun_s16(aa1);
uint8x8_t aaa2 = vqmovun_s16(aa2);

uint8x16_t result = vcombine_u8(aaa1, aaa2);

vst1q_u8((uint8_t *)(d), result);

d+=16;
}
}

int main()
{
cv::Mat src = cv::imread("Lenna.bmp");
cv::Mat dest(src.rows, src.cols, CV_8UC1);

cvtBGR2GrayNEON(src, dest);

cv::imwrite("grey.jpg", dest);

return 0;
}

这是等效的 SSE 代码(来自 here):

void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);

//data structure
//BGR BGR BGR BGR BGR B
//GR BGR BGR BGR BGR BG
//R BGR BGR BGR BGR BGR
//shuffle to BBBBBBGGGGGRRRRR
const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);

//shuffle to GGGGGGBBBBBRRRRR
const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
//const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);

//shuffle to RRRRRRGGGGGBBBBB
//__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
//const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
//const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1

//blend mask
const __m128i bmask1 = _mm_setr_epi8
(255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);

const __m128i bmask2 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);

const __m128i bmask3 = _mm_setr_epi8
(255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);

const __m128i bmask4 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);

const int shift = 8;
const int amp = 1<<shift;
const int _R_=(int)(amp*0.299);
const int _G_=(int)(amp*0.587);
const int _B_=(int)(amp*0.114);
const __m128i R = _mm_set1_epi16(_R_);
const __m128i G = _mm_set1_epi16(_G_);
const __m128i B = _mm_set1_epi16(_B_);
const __m128i zero = _mm_setzero_si128();

for(int i=0;i<size;i+=48)
{
__m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
__m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
__m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);

a = _mm_shuffle_epi8(a,smask1);
b = _mm_shuffle_epi8(b,smask1);
c = _mm_shuffle_epi8(c,smask1);
const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);

a = _mm_shuffle_epi8(a,ssmask1);
c = _mm_shuffle_epi8(c,ssmask1);
b = _mm_shuffle_epi8(b,ssmask2);
const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);

__m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
__m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
a1 = _mm_mullo_epi16(a1,B);
a2 = _mm_mullo_epi16(a2,B);
__m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
__m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
b1 = _mm_mullo_epi16(b1,G);
b2 = _mm_mullo_epi16(b2,G);

__m128i c1 = _mm_unpackhi_epi8(cccc,zero);
__m128i c2 = _mm_unpacklo_epi8(cccc,zero);
c1 = _mm_mullo_epi16(c1,R);
c2 = _mm_mullo_epi16(c2,R);

a1 = _mm_add_epi16(a1,b1);
a1 = _mm_add_epi16(a1,c1);
a2 = _mm_add_epi16(a2,b2);
a2 = _mm_add_epi16(a2,c2);

a1 = _mm_srli_epi16(a1,8);
a2 = _mm_srli_epi16(a2,8);

a = _mm_packus_epi16(a1,a2);

_mm_stream_si128((__m128i*)(d),a);
d+=16;
}
}

最佳答案

好的,下面是我刚刚编写的函数的完全优化版本(请注意,如果大小小于 32,此函数只会返回。)

/*
* Created on: 2014. 7. 27.
* Author: Jake Lee
* Project FANIC - Fastest ARM NEON Implementaion Challenge
*/

// void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Y = 0.114*B + 0.587*G + 0.299*R
.text
.arm
.global fanicCvtBGR2GrayNEON

pDst .req r0
pSrc .req r1
size .req r2

.align 5
.func
fanicCvtBGR2GrayNEON:
pld [pSrc]
subs size, size, #32
pld [pSrc, #64]
bxmi lr
pld [pSrc, #64*2]
vmov.i8 d0, #29
vmov.i8 d1, #150
vmov.i8 d2, #77

.align 5
1:
vld3.8 {d20, d21, d22}, [pSrc]!
vld3.8 {d23, d24, d25}, [pSrc]!
vld3.8 {d26, d27, d28}, [pSrc]!
vld3.8 {d29, d30, d31}, [pSrc]!

vmull.u8 q8, d20, d0
vmlal.u8 q8, d21, d1
vmlal.u8 q8, d22, d2
vmull.u8 q9, d23, d0
vmlal.u8 q9, d24, d1
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d0
vmlal.u8 q10, d27, d1
vmlal.u8 q10, d28, d2
vmull.u8 q11, d29, d0
vmlal.u8 q11, d30, d1
vmlal.u8 q11, d31, d2

vrshrn.u16 d24, q8, #8
vrshrn.u16 d25, q9, #8
vrshrn.u16 d26, q10, #8
vrshrn.u16 d27, q11, #8

subs size, size, #32
pld [pSrc, #64*3]
pld [pSrc, #64*4]

vst1.8 {q12, q13}, [pDst]!
bpl 1b

cmp size, #-32
add pSrc, pSrc, size
bxle lr
add pSrc, pSrc, size, lsl #1
add pDst, pDst, size
b 1b

.endfunc
.end

如您所见,尽管展开繁重,但在汇编中编写 NEON 代码比在内部函数中编写更容易、更短。

玩得开心。

关于c++ - 使用 ARM NEON 内部函数对 cvtColor 进行 SIMD 优化,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24977272/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com