- android - RelativeLayout 背景可绘制重叠内容
- android - 如何链接 cpufeatures lib 以获取 native android 库?
- java - OnItemClickListener 不起作用,但 OnLongItemClickListener 在自定义 ListView 中起作用
- java - Android 文件转字符串
我正在研究 BGR 到灰度转换的 SIMD 优化,相当于 OpenCV's cvtColor()
function .这个函数有一个 Intel SSE 版本,我指的是它。 (我所做的基本上是将 SSE 代码转换为 NEON 代码。)
我快写完代码了,可以用g++编译了,但是我得不到正确的输出。有谁知道错误可能是什么?
我得到的(不正确的):
我应该得到的:
这是我的代码:
#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>
using namespace std;
//using namespace cv;
#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);
const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};
const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};
const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};
const int shift = 8;
const int amp = 1<<shift;
const int16_t _R_ = (int16_t)(amp*0.299);
const int16_t _G_ = (int16_t)(amp*0.587);
const int16_t _B_ = (int16_t)(amp*0.114);
const int16x8_t R = vdupq_n_s16(_R_);
const int16x8_t G = vdupq_n_s16(_G_);
const int16x8_t B = vdupq_n_s16(_B_);
const int8x16_t zero = vdupq_n_s8(0);
for(int i = 0; i < size; i += 48)
{
int8x16_t a = vld1q_s8((int8_t *) s + i);
int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
int8x16_t c = vld1q_s8((int8_t *) s + i + 32);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));
//BBBBBB
const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));
//GGGGGG
const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));
//RRRRRR
const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);
/*
int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
*/
int8x16_t a1 = aaaa;
int8x16_t a2 = zero;
int8x16x2_t temp1 = vzipq_s8(a1, a2);
a1 = temp1.val[0];
a2 = temp1.val[1];
int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);
int8x16_t b1 = bbbb;
int8x16_t b2 = zero;
int8x16x2_t temp2 = vzipq_s8(b1, b2);
b1 = temp2.val[0];
b2 = temp2.val[1];
int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);
int8x16_t c1 = cccc;
int8x16_t c2 = zero;
int8x16x2_t temp3 = vzipq_s8(c1, c2);
c1 = temp3.val[0];
c2 = temp3.val[1];
int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);
aa1 = vaddq_s16(aa1, bb1);
aa1 = vaddq_s16(aa1, cc1);
aa2 = vaddq_s16(aa2, bb2);
aa2 = vaddq_s16(aa2, cc2);
const int shift1 = 8;
aa1 = vshrq_n_s16(aa1, shift1);
aa2 = vshrq_n_s16(aa2, shift1);
uint8x8_t aaa1 = vqmovun_s16(aa1);
uint8x8_t aaa2 = vqmovun_s16(aa2);
uint8x16_t result = vcombine_u8(aaa1, aaa2);
vst1q_u8((uint8_t *)(d), result);
d+=16;
}
}
int main()
{
cv::Mat src = cv::imread("Lenna.bmp");
cv::Mat dest(src.rows, src.cols, CV_8UC1);
cvtBGR2GrayNEON(src, dest);
cv::imwrite("grey.jpg", dest);
return 0;
}
这是等效的 SSE 代码(来自 here):
void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);
//data structure
//BGR BGR BGR BGR BGR B
//GR BGR BGR BGR BGR BG
//R BGR BGR BGR BGR BGR
//shuffle to BBBBBBGGGGGRRRRR
const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);
//shuffle to GGGGGGBBBBBRRRRR
const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
//const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);
//shuffle to RRRRRRGGGGGBBBBB
//__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
//const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
//const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1
//blend mask
const __m128i bmask1 = _mm_setr_epi8
(255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);
const __m128i bmask2 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);
const __m128i bmask3 = _mm_setr_epi8
(255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);
const __m128i bmask4 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);
const int shift = 8;
const int amp = 1<<shift;
const int _R_=(int)(amp*0.299);
const int _G_=(int)(amp*0.587);
const int _B_=(int)(amp*0.114);
const __m128i R = _mm_set1_epi16(_R_);
const __m128i G = _mm_set1_epi16(_G_);
const __m128i B = _mm_set1_epi16(_B_);
const __m128i zero = _mm_setzero_si128();
for(int i=0;i<size;i+=48)
{
__m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
__m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
__m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);
a = _mm_shuffle_epi8(a,smask1);
b = _mm_shuffle_epi8(b,smask1);
c = _mm_shuffle_epi8(c,smask1);
const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);
a = _mm_shuffle_epi8(a,ssmask1);
c = _mm_shuffle_epi8(c,ssmask1);
b = _mm_shuffle_epi8(b,ssmask2);
const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);
__m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
__m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
a1 = _mm_mullo_epi16(a1,B);
a2 = _mm_mullo_epi16(a2,B);
__m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
__m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
b1 = _mm_mullo_epi16(b1,G);
b2 = _mm_mullo_epi16(b2,G);
__m128i c1 = _mm_unpackhi_epi8(cccc,zero);
__m128i c2 = _mm_unpacklo_epi8(cccc,zero);
c1 = _mm_mullo_epi16(c1,R);
c2 = _mm_mullo_epi16(c2,R);
a1 = _mm_add_epi16(a1,b1);
a1 = _mm_add_epi16(a1,c1);
a2 = _mm_add_epi16(a2,b2);
a2 = _mm_add_epi16(a2,c2);
a1 = _mm_srli_epi16(a1,8);
a2 = _mm_srli_epi16(a2,8);
a = _mm_packus_epi16(a1,a2);
_mm_stream_si128((__m128i*)(d),a);
d+=16;
}
}
最佳答案
好的,下面是我刚刚编写的函数的完全优化版本(请注意,如果大小小于 32,此函数只会返回。)
/*
* Created on: 2014. 7. 27.
* Author: Jake Lee
* Project FANIC - Fastest ARM NEON Implementaion Challenge
*/
// void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Y = 0.114*B + 0.587*G + 0.299*R
.text
.arm
.global fanicCvtBGR2GrayNEON
pDst .req r0
pSrc .req r1
size .req r2
.align 5
.func
fanicCvtBGR2GrayNEON:
pld [pSrc]
subs size, size, #32
pld [pSrc, #64]
bxmi lr
pld [pSrc, #64*2]
vmov.i8 d0, #29
vmov.i8 d1, #150
vmov.i8 d2, #77
.align 5
1:
vld3.8 {d20, d21, d22}, [pSrc]!
vld3.8 {d23, d24, d25}, [pSrc]!
vld3.8 {d26, d27, d28}, [pSrc]!
vld3.8 {d29, d30, d31}, [pSrc]!
vmull.u8 q8, d20, d0
vmlal.u8 q8, d21, d1
vmlal.u8 q8, d22, d2
vmull.u8 q9, d23, d0
vmlal.u8 q9, d24, d1
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d0
vmlal.u8 q10, d27, d1
vmlal.u8 q10, d28, d2
vmull.u8 q11, d29, d0
vmlal.u8 q11, d30, d1
vmlal.u8 q11, d31, d2
vrshrn.u16 d24, q8, #8
vrshrn.u16 d25, q9, #8
vrshrn.u16 d26, q10, #8
vrshrn.u16 d27, q11, #8
subs size, size, #32
pld [pSrc, #64*3]
pld [pSrc, #64*4]
vst1.8 {q12, q13}, [pDst]!
bpl 1b
cmp size, #-32
add pSrc, pSrc, size
bxle lr
add pSrc, pSrc, size, lsl #1
add pDst, pDst, size
b 1b
.endfunc
.end
如您所见,尽管展开繁重,但在汇编中编写 NEON 代码比在内部函数中编写更容易、更短。
玩得开心。
关于c++ - 使用 ARM NEON 内部函数对 cvtColor 进行 SIMD 优化,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24977272/
关闭。这个问题不符合Stack Overflow guidelines .它目前不接受答案。 想改进这个问题?将问题更新为 on-topic对于堆栈溢出。 7年前关闭。 Improve this qu
我有一个代码库,我可以在我的 mac 上编译和运行,但不能在我的远程 linux 机器上编译和运行,我不确定为什么。 编译时出现错误 fatal error: simd/simd.h: No such
我需要了解如何编写一些可并行化问题的 C++ 跨平台实现,以便在可用的情况下利用 SIMD(SSE、SPU 等)。以及我希望能够在运行时在 SIMD 和非 SIMD 之间切换。 您建议我如何解决这个问
我正在使用 AVX 内在 _mm256_extract_epi32() . 不过,我不完全确定我是否正确使用它,因为 gcc 不喜欢我的代码,而 clang 编译它并运行它没有问题。 我根据整数变量的
当我可以使用 SSE3 或 AVX 时,SSE2 或 MMX 等较旧的 SSE 版本是否可用 - 还是我还需要单独检查它们? 最佳答案 一般来说,这些都是附加的,但请记住,多年来英特尔和 AMD 对这
在 godbolt.org 使用 gcc 7.2 我可以看到以下内容 code在汇编程序中翻译得非常好。我看到 1 次加载、1 次添加和 1 次存储。 #include __attribute__(
假设我们有一个函数将两个数组相乘,每个数组有 1000000 个 double 值。在 C/C++ 中,该函数如下所示: void mul_c(double* a, double* b) {
我有一个 A = a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4 我有两排, float32x2_t a = a1 a2 flo
我正在考虑编写一个 SIMD vector 数学库,因此作为一个快速基准,我编写了一个程序,该程序执行 1 亿(4 个 float ) vector 元素乘法并将它们加到累积总数中。对于我的经典非 S
我正在开发带有英特尔编译器 OpenMP 4.0 的英特尔 E5(6 核、12 线程) 为什么这段代码 SIMD 编译比并行 SIMD 编译更快? for (int suppv = 0; suppv
OpenMP 4.0 引入了 SIMD 结构以利用 CPU 的 SIMD 指令。根据规范http://www.openmp.org/mp-documents/OpenMP4.0.0.pdf ,有两种结
英特尔编译器允许我们通过以下方式对循环进行矢量化 #pragma simd for ( ... ) 但是,您也可以选择使用 OpenMP 4 的指令执行此操作: #pragma omp simd fo
关注我的 x86 question ,我想知道如何在 Arm-v8 上有效地矢量化以下代码: static inline uint64_t Compress8x7bit(uint64_t x) {
Intel 提供了几个 SIMD 命令,它们似乎都对 128 位数据执行按位异或: _mm_xor_pd(__m128d, __m128d) _mm_xor_ps(__m128, __m128) _m
可以使用“位打包”技术压缩无符号整数:在一个无符号整数 block 中,只存储有效位,当一个 block 中的所有整数都“小”时,会导致数据压缩。该方法称为 FOR (引用框架)。 有SIMD lib
SSE 寄存器是否在逻辑处理器(超线程)之间共享或复制? 对于 SSE 繁重的程序,我能否期望从并行化中获得与普通程序相同的加速(英特尔声称具有超线程的处理器为 30%)? 最佳答案 从英特尔的文档中
我正在编写一个使用 SSE 指令来乘法和相加整数值的程序。我用浮点数做了同样的程序,但我的整数版本缺少一个指令。 使用浮点数,在完成所有操作后,我将 de 值返回到常规浮点数数组,执行以下操作: _m
我正在开发基于Intel指令集(AVX,FMA等)的高性能算法。当数据按顺序存储时,我的算法(内核)运行良好。但是,现在我面临一个大问题,但没有找到解决方法或解决方案: see 2D Matrix i
大家好 :) 我正在尝试了解有关浮点、SIMD/数学内在函数和 gcc 的快速数学标志的一些概念。更具体地说,我在 x86 cpu 上使用 MinGW 和 gcc v4.5.0。 我已经搜索了一段时间
根据https://sourceware.org/glibc/wiki/libmvec GCC 具有数学函数的向量实现。它们可以被编译器用于优化,可以在这个例子中看到:https://godbolt.
我是一名优秀的程序员,十分优秀!