gpt4 book ai didi

c++ - 如何加快积分图像的计算?

转载 作者:塔克拉玛干 更新时间:2023-11-03 00:18:36 26 4
gpt4 key购买 nike

我经常需要计算积分图像。这是一个简单的算法:

uint32_t void integral_sum(const uint8_t * src, size_t src_stride, size_t width, size_t height, uint32_t * sum, size_t sum_stride)
{
memset(sum, 0, (width + 1) * sizeof(uint32_t));
sum += sum_stride + 1;
for (size_t row = 0; row < height; row++)
{
uint32_t row_sum = 0;
sum[-1] = 0;
for (size_t col = 0; col < width; col++)
{
row_sum += src[col];
sum[col] = row_sum + sum[col - sum_stride];
}
src += src_stride;
sum += sum_stride;
}
}

我有一个问题。我可以加快此算法的速度(例如,使用 SSE 或 AVX)吗?

最佳答案

算法中有一个令人讨厌的特征:图像中每个点的积分和取决于该行积分和的先前值。这种情况阻碍了算法的矢量化(使用 vector 指令,如 SSE 或 AVX)。但是使用特殊指令 vpsadbw (AVX2) or vpsadbw (AVX-512BW) 有一个技巧。

AVX2版本的算法:

void integral_sum(const uint8_t * src, size_t src_stride, size_t width, size_t height, uint32_t * sum, size_t sum_stride)
{
__m256i MASK = _mm_setr_epi64(0x00000000000000FF, 0x000000000000FFFF, 0x0000000000FFFFFF, 0x00000000FFFFFFFF);
__m256i PACK = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
__m256i ZERO = _mm256_set1_epi32(0);

memset(sum, 0, (width + 1)*sizeof(uint32_t));
sum += sum_stride + 1;
size_t aligned_width = width/4*4;

for(size_t row = 0; row < height; row++)
{
sum[-1] = 0;
size_t col = 0;
__m256i row_sums = ZERO;
for(; col < aligned_width; col += 4)
{
__m256i _src = _mm256_and_si256(_mm256_set1_epi32(*(uint32_t*)(src + col)), MASK);
row_sums = _mm256_add_epi32(row_sums, _mm256_sad_epu8(_src, ZERO));
__m128i curr_row_sums = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(row_sums, PACK));
__m128i prev_row_sums = _mm_loadu_si128((__m128i*)(sum + col - sum_stride));
_mm_storeu_si128((__m128i*)(sum + col), _mm_add_epi32(curr_row_sums, prev_row_sums));
row_sums = _mm256_permute4x64_epi64(row_sums, 0xFF);
}
uint32_t row_sum = sum[col - 1] - sum[col - sum_stride - 1];
for (; col < width; col++)
{
row_sum += src[col];
sum[col] = row_sum + sum[col - sum_stride];
}
src += src_stride;
sum += sum_stride;
}
}

这个技巧可以提高1.8倍的性能。

模拟使用AVX-512BW:

void integral_sum(const uint8_t * src, size_t src_stride, size_t width, size_t height, uint32_t * sum, size_t sum_stride)
{
__m512i MASK = _mm_setr_epi64(
0x00000000000000FF, 0x000000000000FFFF, 0x0000000000FFFFFF, 0x00000000FFFFFFFF
0xFFFFFFFFFFFFFFFF, 0x00FFFFFFFFFFFFFF, 0x0000FFFFFFFFFFFF, 0x000000FFFFFFFFFF);
__m512i K_15 = _mm512_set1_epi32(15);
__m512i ZERO = _mm512_set1_epi32(0);

memset(sum, 0, (width + 1)*sizeof(uint32_t));
sum += sum_stride + 1;
size_t aligned_width = width/8*8;

for(size_t row = 0; row < height; row++)
{
sum[-1] = 0;
size_t col = 0;
__m512i row_sums = ZERO;
for(; col < aligned_width; col += 8)
{
__m512i _src = _mm512_and_si512(_mm512_set1_epi32(*(uint32_t*)(src + col)), MASK);
row_sums = _mm512_add_epi512(row_sums, _mm512_sad_epu8(_src, ZERO));
__m256i curr_row_sums = _mm512_cvtepi64_epi32(row_sums);
__m256i prev_row_sums = _mm256_loadu_si256((__m256i*)(sum + col - sum_stride));
_mm_storeu_si128((__m128i*)(sum + col), _mm_add_epi32(curr_row_sums, prev_row_sums));
row_sums = _mm512_permutexvar_epi64(row_sums, K_15);
}
uint32_t row_sum = sum[col - 1] - sum[col - sum_stride - 1];
for (; col < width; col++)
{
row_sum += src[col];
sum[col] = row_sum + sum[col - sum_stride];
}
src += src_stride;
sum += sum_stride;
}
}

这个技巧可以提高 3.5 倍的性能。

附言原算法放在这里:AVX2AVX-512BW

关于c++ - 如何加快积分图像的计算?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/46520275/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com