gpt4 book ai didi

c++ - SSE1,2,3 round() 不完全遵循 std::round() 结果

转载 作者:行者123 更新时间:2023-12-05 04:31:42 28 4
gpt4 key购买 nike

我正在尝试使用 SSE(1,2,3) 指令创建类似于 std::round() 的函数,但某些值和/或逻辑运算符存在一些问题。这是我的代码:

#include <iostream>
#include <cmath>
#include <emmintrin.h>

/// Scalar reference rounding: adds +0.5 to inputs greater than zero and -0.5
/// to all other inputs, then truncates the sum toward zero.
///
/// @param x value to round
/// @return the biased value converted to int (undefined if it does not fit)
int round_int( float x ) {
    // The original `(int) (x > 0.0f) ? a : b` parsed as
    // `((int)(x > 0.0f)) ? a : b`: the cast applied to the comparison and the
    // truncation only happened through the implicit conversion on return.
    // Cast the selected sum explicitly instead.
    const float biased = (x > 0.0f) ? (x + 0.5f) : (x - 0.5f);
    return static_cast<int>(biased);
}

// Question version: adds +0.5 to lanes that are > 0 and -0.5 to every other
// lane, converts each lane to int32, then converts back to float.
// Note: _mm_cvtps_epi32 converts using the current MXCSR rounding mode
// (round-to-nearest-even by default); it does not truncate.
__m128 roundf_sse(__m128 x){
    const __m128 half = _mm_set1_ps(0.5f);
    const __m128 sign_bit = _mm_set1_ps(-0.0f);

    // All-ones in lanes where x > 0, all-zeros elsewhere.
    const __m128 is_positive = _mm_cmpgt_ps(x, _mm_setzero_ps());

    // Start from +0.5 and set the sign bit in the non-positive lanes -> -0.5.
    const __m128 bias = _mm_or_ps(half, _mm_andnot_ps(is_positive, sign_bit));

    const __m128 biased = _mm_add_ps(x, bias);
    const __m128i as_int = _mm_cvtps_epi32(biased);
    return _mm_cvtepi32_ps(as_int);
}

// Converts each lane to int32 with _mm_cvtps_epi32 and straight back to float.
// The conversion follows the current MXCSR rounding mode
// (round-to-nearest-even by default).
__m128 roundf_mp(__m128 x){
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(x));
}

// Test driver: for x in [-10.0, 10.9] in steps of 0.1, prints the result of
// std::round, round_int, roundf_sse and roundf_mp side by side.
int main() {
    for (int i = -10; i <= 10; i++) {
        for (int j = 0; j < 10; j++) {
            const float x = static_cast<float>(i) + (static_cast<float>(j) / 10.0f);

            std::cout << "x = " << x << " ------------------------ " << '\n';
            std::cout << "std::round = " << std::round(x) << '\n';
            std::cout << "round_int = " << round_int(x) << '\n';

            // _mm_store_ps requires a 16-byte-aligned destination; a plain
            // float[4] is only guaranteed 4-byte alignment.
            alignas(16) float m128res[4] = { 0 };
            const __m128 in = _mm_set1_ps(x);

            _mm_store_ps(m128res, roundf_sse(in));
            std::cout << "roundf_sse = " << m128res[0] << '\n';

            _mm_store_ps(m128res, roundf_mp(in));
            std::cout << "roundf_mp = " << m128res[0] << '\n';
        }
    }
}

使用 Compiler Explorer 完成的一些测试 - https://godbolt.org/z/b5b5YqEKo

问题是:

a) roundf_mp() 函数:对于 ±6.5、±4.5、±2.5、±0.5 这类输入值,现在会得到错误的结果

b) roundf_sse() 函数:它尝试沿用 round_int 函数的结构(round_int 的结果与 std::round() 的输出相同),并且部分基于这篇帖子中关于无分支“select”(cond ? a : b) 的部分。

对于 a 情况下出现问题的原因,有什么建议吗?b 情况下是否有没有正确实现的地方?

编辑:通过使用 _mm_cvttps_epi32 将 float 截断转换为 int,我得到了正确的舍入结果:

// Question edit: same +0.5 / -0.5 bias as before, but the float->int
// conversion uses _mm_cvttps_epi32, which truncates toward zero.
__m128 roundf_sse(__m128 x){
    __m128 zero = _mm_set1_ps(0.0f);
    __m128 a = _mm_set1_ps(0.5f);
    __m128 b = _mm_set1_ps(-0.5f);
    // All-ones in lanes where x > 0, all-zeros elsewhere.
    __m128 cond = _mm_cmpgt_ps(x, zero);
    // Branchless select: a where cond is set, b elsewhere.
    __m128 val = _mm_or_ps(_mm_and_ps(a, cond), _mm_andnot_ps(cond, b));

    return _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val)));
}

这样做会不会有隐患(例如需要处理某些特定值、范围限制等)?

最佳答案

在评论者的帮助下,我解决了问题,并且通过更改实现方式还提高了此函数的性能。有一点需要注意:此函数的适用范围限制为 ±2^23。对于超出此范围的值,可以通过在浮点到整数的转换中使用 _mm_cvtps_epi32() 来扩展该限制。

我的原始实现(更改了 float 到整数转换所用的 intrinsic):

// Adds +0.5 to lanes > 0 and -0.5 to all other lanes, then truncates toward
// zero via _mm_cvttps_epi32 and converts back to float.
// Per the original author: for |x| < 2^23.
__m128 roundf_sse(__m128 x){
    const __m128 pos_half = _mm_set1_ps(0.5f);
    const __m128 neg_half = _mm_set1_ps(-0.5f);

    // All-ones in lanes where x > 0, all-zeros elsewhere.
    const __m128 gt_zero = _mm_cmpgt_ps(x, _mm_setzero_ps());

    // Branchless select written as neg ^ ((pos ^ neg) & mask):
    // yields pos_half where the mask is set and neg_half elsewhere.
    const __m128 toggle = _mm_and_ps(_mm_xor_ps(pos_half, neg_half), gt_zero);
    const __m128 bias = _mm_xor_ps(neg_half, toggle);

    const __m128i truncated = _mm_cvttps_epi32(_mm_add_ps(x, bias));
    return _mm_cvtepi32_ps(truncated);
}

它的舍入结果与 std::round() 相同,但速度比 std::round() 慢约 60%(按 rdtsc/val 计,GCC -O3 -ffast-math)。

chtz 建议的实现给出与 std::round() 相同的舍入结果,性能也几乎相同(rdtsc/val),而且与我的方法相比所需代码更少:

// Builds 0.5 carrying the sign bit of each lane of x, adds it to x, then
// truncates toward zero via _mm_cvttps_epi32 and converts back to float.
// Per the original author: for |x| < 2^23.
__m128 roundf_sse(__m128 x){
    const __m128 sign_mask = _mm_set1_ps(-0.0f);   // only the sign bit set
    const __m128 half = _mm_set1_ps(0.5f);

    const __m128 sign_of_x = _mm_and_ps(x, sign_mask);
    const __m128 signed_half = _mm_or_ps(half, sign_of_x);

    const __m128 shifted = _mm_add_ps(x, signed_half);
    return _mm_cvtepi32_ps(_mm_cvttps_epi32(shifted));
}

编辑:

这个实现(实际上是它的三个版本)适用于整个范围:

// Rounds each lane to an integer by adding and then subtracting a large
// constant, so that the float addition itself discards the fractional bits.
// The result follows the current floating-point rounding mode
// (round-to-nearest-even by default), so ties do NOT round away from zero.
//
// M = 12582912 = 1.5 * 2^23 = 2^23 + 2^22. For |x| < 2^22 the sum x + M lies
// in [2^23, 2^24), where adjacent floats are exactly 1 apart. For larger
// magnitudes the sum lands where the spacing exceeds 1, so odd integers can
// be altered.
// NOTE(review): flags such as -ffast-math may let the compiler fold
// (x + M) - M back to x — confirm for your build.
__m128 round_M(__m128 x){
    const __m128 M = _mm_set1_ps(12582912.0f); // magic number: 1.5 * 2^23
    x = _mm_add_ps(x, M);
    x = _mm_sub_ps(x, M);
    return x;
}

// Full-range SSE2 rounding of four floats: nearest integer, halfway cases
// away from zero.
//
// Lanes with |x| >= 2^23 (already integral), infinities and NaN are passed
// through unchanged. All other lanes are biased by almost-0.5 carrying the
// sign of x, truncated toward zero via int32, and converted back to float.
//
// NOTE: the original author measured this approach at about 2.5x slower than
// std::round() (rdtsc/val).
__m128 roundf_sse(__m128 x){
    const __m128 SIGNMASK = _mm_set1_ps(-0.0f);      // only the sign bit set
    const __m128 lim = _mm_set1_ps(0x1.0p23f);
    // Largest float below 0.5. Using exactly 0.5 would turn 0.49999997f into
    // 1.0 because the float addition rounds the sum up before truncation.
    const __m128 almost_half = _mm_set1_ps(0x1.fffffep-2f);

    const __m128 sign = _mm_and_ps(SIGNMASK, x);
    const __m128 bias = _mm_or_ps(sign, almost_half);

    // NOT(|x| < 2^23): set for large magnitudes and, unlike cmpge, for NaN,
    // so NaN lanes keep their value instead of being converted through int32.
    const __m128 cond = _mm_cmpnlt_ps(_mm_andnot_ps(SIGNMASK, x), lim);

    // |x| < 2^23: bias, truncate, convert back; then restore the sign bit so
    // inputs in (-0.5, -0.0] yield -0.0 rather than +0.0.
    __m128 small = _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, bias)));
    small = _mm_or_ps(small, sign);

    // Alternatives tried by the original author for the |x| >= 2^23 lanes,
    // instead of passing x through unchanged:
    //   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
    //   round_M(x)

    // Branchless select: x where cond is set, the rounded value elsewhere.
    return _mm_or_ps(_mm_and_ps(x, cond), _mm_andnot_ps(cond, small));
}

但是,与 std::round() 相比,(rdtsc/val) 慢得多。

关于c++ - SSE1,2,3 round() 不完全遵循 std::round() 结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/71801608/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com