gpt4 book ai didi

c - _mm_storeu_si128 花费太多时间?

转载 作者:行者123 更新时间:2023-11-30 19:40:41 33 4
gpt4 key购买 nike

这是一个 C 函数,它获取 src 的权重值并将其存储到 dst 中。

/*
 * Scalar reference implementation: weights the 16 coefficients in src and
 * stores the results in dst.
 *
 * For each position, with t = threshold[index][pos] and level = src[pos]:
 *   - if the unsigned test (level + t) > 2t fails, dst[pos] is left untouched;
 *   - else if the unsigned test (level + 2t) > 4t holds,
 *     dst[pos] = level * factor[pos];
 *   - otherwise dst[pos] = 2 * (level -/+ t) * factor[pos], where t is
 *     subtracted for a positive level and added for any other level.
 *
 * src   : 16 input coefficients.
 * index : row of the threshold table to use.
 * dst   : 16 output values; entries failing the first test are not written.
 *
 * Always returns 0.
 */
static int _medium_c( DCTELEM * src, int index, int *dst )
{
    int pos;

    for( pos = 0; pos < 16; pos++ )
    {
        const unsigned int t1 = threshold[index][pos];
        const unsigned int t2 = t1 << 1;
        const int level = src[pos];
        int shifted;

        /* Unsigned range check: nothing is stored for this position. */
        if( ( unsigned )( level + t1 ) <= t2 )
            continue;

        /* Outside the wider band the coefficient is only scaled. */
        if( ( unsigned )( level + 2 * t1 ) > 2 * t2 )
        {
            dst[pos] = level * factor[pos];
            continue;
        }

        /* Inside the band: move level towards zero by t1, then double and scale. */
        shifted = ( level > 0 ) ? level - ( int )t1 : level + ( int )t1;
        dst[pos] = 2 * shifted * factor[pos];
    }

    return 0;
}

使用 SSE intrinsics(内联函数)的版本是:

/*
 * SSE version of the weighting routine in _medium_c: handles the 16
 * coefficients as two batches of eight 16-bit lanes and stores sixteen
 * 32-bit results to dst.
 *
 * src   : 16 input coefficients, loaded eight per 128-bit register
 *         (assumes DCTELEM is a 16-bit integer type -- TODO confirm).
 * index : row of the threshold table to use.
 * dst   : 16 output ints; every entry is written, with 0 stored for lanes
 *         that fail the first range test.
 *
 * NOTE(review): factor[] and threshold[][] are loaded four elements per
 * 128-bit register and narrowed to their low 16 bits, so this presumably
 * expects 32-bit elements whose values fit in 16 bits -- confirm against
 * their declarations.
 *
 * Always returns 0.
 */
int medium_intrinsic16( DCTELEM * src, int index, int* dst )
{
    /* Loop-invariant constants, built once. */
    const __m128i zero128 = _mm_setzero_si128();
    /* Gathers bytes 0,1,4,5,8,9,12,13 (the low 16 bits of each 32-bit
     * element) into the low 64 bits; 0x80 selectors zero the high 64 bits. */
    const __m128i mask = _mm_set_epi8( 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                       0x0d, 0x0c, 0x09, 0x08, 0x05, 0x04, 0x01, 0x00 );
    int j;

    for( j = 0; j < 2; j++ )
    {
        /* Pack eight factors into eight 16-bit lanes. */
        __m128i factor_a = _mm_loadu_si128( (__m128i*)&factor[8*j] );
        factor_a = _mm_shuffle_epi8( factor_a, mask );
        __m128i factor_b = _mm_loadu_si128( (__m128i*)&factor[8*j+4] );
        factor_b = _mm_shuffle_epi8( factor_b, mask );
        factor_a = _mm_unpacklo_epi64( factor_a, factor_b );

        __m128i level_a = _mm_loadu_si128( (__m128i*)&src[8*j] );

        /* Pack eight thresholds the same way. */
        __m128i threshold1_a = _mm_loadu_si128( (__m128i*)&threshold[index][8*j] );
        threshold1_a = _mm_shuffle_epi8( threshold1_a, mask );
        __m128i threshold1_b = _mm_loadu_si128( (__m128i*)&threshold[index][8*j+4] );
        threshold1_b = _mm_shuffle_epi8( threshold1_b, mask );
        threshold1_a = _mm_unpacklo_epi64( threshold1_a, threshold1_b );
        /* The lanes are 16 bits wide here, so double them with the 16-bit
         * shift; a 32-bit shift would carry the top bit of each even lane
         * into the neighbouring odd lane. */
        __m128i threshold2_a = _mm_slli_epi16( threshold1_a, 1 );

        /* Per-lane mask: level > 0. */
        __m128i mif = _mm_cmpgt_epi16( level_a, zero128 );

        __m128i m0 = _mm_sub_epi16( level_a, threshold1_a ); /* level - threshold1 */
        __m128i m1 = _mm_add_epi16( level_a, threshold1_a ); /* level + threshold1 */
        __m128i m2 = _mm_slli_epi16( factor_a, 1 );          /* 2 * factor */

        /* Low and high 16-bit halves of the 32-bit products. */
        __m128i m3 = _mm_mullo_epi16( m0, m2 ); /* lo: 2*(level - threshold1)*factor */
        __m128i m4 = _mm_mulhi_epi16( m0, m2 ); /* hi: 2*(level - threshold1)*factor */
        __m128i m5 = _mm_mullo_epi16( m1, m2 ); /* lo: 2*(level + threshold1)*factor */
        __m128i m6 = _mm_mulhi_epi16( m1, m2 ); /* hi: 2*(level + threshold1)*factor */

        /* Positive levels take the (level - threshold1) product, others the
         * (level + threshold1) product. */
        m3 = _mm_blendv_epi8( m5, m3, mif );
        m4 = _mm_blendv_epi8( m6, m4, mif );

        /* Unsigned test (level + 2*threshold1) >= 2*threshold2, expressed as
         * max(a, b) == a. */
        m0 = _mm_add_epi16( level_a, threshold2_a ); /* level + 2*threshold1 */
        m1 = _mm_slli_epi16( threshold2_a, 1 );      /* 2*threshold2 */
        m2 = _mm_max_epu16( m0, m1 );
        mif = _mm_cmpeq_epi16( m2, m0 );
        m0 = _mm_mullo_epi16( level_a, factor_a );   /* lo: level*factor */
        m1 = _mm_mulhi_epi16( level_a, factor_a );   /* hi: level*factor */

        /* Lanes passing the test take level*factor, the rest keep the
         * doubled product selected above. */
        m0 = _mm_blendv_epi8( m3, m0, mif );
        m1 = _mm_blendv_epi8( m4, m1, mif );

        /* Unsigned test (level + threshold1) >= threshold2; lanes that fail
         * it are cleared to zero. */
        m2 = _mm_add_epi16( level_a, threshold1_a );
        m3 = _mm_max_epu16( m2, threshold2_a );
        mif = _mm_cmpeq_epi16( m3, m2 );

        m0 = _mm_and_si128( mif, m0 );
        m1 = _mm_and_si128( mif, m1 );

        /* Interleave the low/high halves into eight 32-bit results. */
        m2 = _mm_unpacklo_epi16( m0, m1 );
        m3 = _mm_unpackhi_epi16( m0, m1 );
        _mm_storeu_si128( (__m128i*)&dst[8*j],   m2 ); /* will run fast if removed */
        _mm_storeu_si128( (__m128i*)&dst[8*j+4], m3 ); /* will run fast if removed */
    }
    return 0;
}

使用 intrinsics 的版本并不比 C 版本更快。问题是,如果我删除 for 循环的最后两行(如代码中所示),即 _mm_storeu_si128((__m128i*)&dst[8*j] , m2) 和 _mm_storeu_si128((__m128i*)&dst[8*j+4], m3),intrinsics 版本的运行速度就明显比 C 版本快(大约快 4 倍)。谁能解释为什么会发生这种情况? _mm_storeu_si128() 真的要花这么多时间吗?谢谢

最佳答案

如果它的速度与 C 版本相同,那么您可能会遇到内存带宽瓶颈。在这种情况下,是的,存储到内存是算法中最昂贵的事情。

或者当结果没有存储在任何地方时,编译器可能会优化掉大量代码!您必须查看汇编以确保它只是省略了存储指令,而不是优化了大部分功能。

参见 http://agner.org/optimize/ ,以及 x86 标签 wiki( https://stackoverflow.com/tags/x86/info )中的其他链接(尤其是 Ulrich Drepper 关于缓存的论文)。

可以研究一下缓存分块(cache blocking),也称为循环分块(loop tiling)。

关于c - _mm_storeu_si128 花费太多时间?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/35081696/

33 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com