gpt4 book ai didi

c - 高效的 SSE FP `floor()`/`ceil()`/`round()` 没有 SSE4.1 的舍入函数?

转载 作者:太空宇宙 更新时间:2023-11-03 23:58:41 28 4
gpt4 key购买 nike

如何像这些函数一样将 float 的 __m128 vector 向上/向下舍入或舍入到最接近的整数?

我需要在没有 SSE4.1 的 roundps 指令(即 _mm_floor_ps / _mm_ceil_ps / _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))的情况下实现这些功能。roundps 也可以向零截断,但我的应用不需要这种模式。

我可以使用 SSE3 和更早版本。 (没有 SSSE3 或 SSE4)

所以函数声明应该是这样的:

__m128 RoundSse( __m128 x );
__m128 CeilSse( __m128 x );
__m128 FloorSse( __m128 x );

最佳答案

我正在发布来自 http://dss.stephanierct.com/DevBlog/?p=8 的代码:

它应该采用按值形式(我只是从代码中删除了&,不确定是否可以):

// Round each of the four floats in x toward negative infinity, using only
// SSE/SSE2 instructions (no SSE4.1 roundps).
//
// The value is truncated through an int32 round trip and then corrected by
// one where truncation moved it upward. Lanes whose magnitude is at least
// 2^23 are already integral in single precision and are returned unchanged;
// the same path passes NaN through. Without that guard the int32 conversion
// would turn such lanes into -2147483648.0f.
//
// NOTE: inputs in (-1, 0) produce -1.0f as expected, but -0.0f produces
// +0.0f because the int32 round trip loses the sign of zero.
static inline __m128 FloorSse(const __m128 x) {
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 sign_bit = _mm_set1_ps(-0.0f);          // 0x80000000 per lane
    const __m128 integral_from = _mm_set1_ps(8388608.0f); // 2^23

    // Truncate toward zero: float -> int32 -> float.
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));

    // For negative non-integers truncation lands above x; step down by 1.0.
    const __m128 overshoot = _mm_cmpgt_ps(truncated, x);
    const __m128 floored = _mm_sub_ps(truncated, _mm_and_ps(overshoot, one));

    // Select the computed value only where |x| < 2^23. The comparison is
    // false for NaN, so NaN lanes fall through to x as well.
    const __m128 magnitude = _mm_andnot_ps(sign_bit, x);
    const __m128 use_floored = _mm_cmplt_ps(magnitude, integral_from);
    return _mm_or_ps(_mm_and_ps(use_floored, floored),
                     _mm_andnot_ps(use_floored, x));
}

// Round each of the four floats in x toward positive infinity, using only
// SSE/SSE2 instructions (no SSE4.1 roundps).
//
// The value is truncated through an int32 round trip and then corrected by
// one where truncation moved it downward. Lanes whose magnitude is at least
// 2^23 are already integral in single precision and are returned unchanged;
// the same path passes NaN through. Without that guard the int32 conversion
// would turn such lanes into -2147483648.0f.
//
// NOTE: inputs in (-1, 0] produce +0.0f rather than -0.0f because the int32
// round trip loses the sign of zero.
static inline __m128 CeilSse(const __m128 x) {
    const __m128 one = _mm_set1_ps(1.0f);
    const __m128 sign_bit = _mm_set1_ps(-0.0f);          // 0x80000000 per lane
    const __m128 integral_from = _mm_set1_ps(8388608.0f); // 2^23

    // Truncate toward zero: float -> int32 -> float.
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x));

    // For positive non-integers truncation lands below x; step up by 1.0.
    const __m128 undershoot = _mm_cmplt_ps(truncated, x);
    const __m128 ceiled = _mm_add_ps(truncated, _mm_and_ps(undershoot, one));

    // Select the computed value only where |x| < 2^23. The comparison is
    // false for NaN, so NaN lanes fall through to x as well.
    const __m128 magnitude = _mm_andnot_ps(sign_bit, x);
    const __m128 use_ceiled = _mm_cmplt_ps(magnitude, integral_from);
    return _mm_or_ps(_mm_and_ps(use_ceiled, ceiled),
                     _mm_andnot_ps(use_ceiled, x));
}

// Round each of the four floats in a to a nearby integer, using only
// SSE/SSE2 instructions (no SSE4.1 roundps).
//
// Method: split a into its truncated part and the remainder, scale the
// remainder by the largest float below 2.0 and truncate again. A remainder
// strictly above one half scales past 1.0 and contributes +/-1; anything
// else contributes 0.
//
// NOTE: because the scale factor is slightly below 2.0, an exact tie
// (remainder of exactly 0.5) rounds toward zero: 1.5 -> 1, -2.5 -> -2.
// This differs from both roundf() (ties away from zero) and roundps with
// _MM_FROUND_TO_NEAREST_INT (ties to even).
//
// Lanes whose magnitude is at least 2^23 are already integral in single
// precision and are returned unchanged; the same path passes NaN through.
// Without that guard the int32 conversion would turn lanes outside the
// int32 range into -2147483648.0f.
static inline __m128 RoundSse(const __m128 a) {
    // Bit pattern 0x3FFFFFFF: the highest float value below 2.0.
    const __m128 just_under_two = _mm_castsi128_ps(_mm_set1_epi32(0x3FFFFFFF));
    const __m128 sign_bit = _mm_set1_ps(-0.0f);          // 0x80000000 per lane
    const __m128 integral_from = _mm_set1_ps(8388608.0f); // 2^23

    // Integer part of a (truncated toward zero) and what is left over.
    const __m128 whole = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
    const __m128 remainder = _mm_sub_ps(a, whole);

    // Scaled remainder truncates to -1, 0 or +1: the rounding offset.
    const __m128 scaled = _mm_mul_ps(remainder, just_under_two);
    const __m128 offset = _mm_cvtepi32_ps(_mm_cvttps_epi32(scaled));
    const __m128 rounded = _mm_add_ps(whole, offset);

    // Select the computed value only where |a| < 2^23. The comparison is
    // false for NaN, so NaN lanes fall through to a as well.
    const __m128 magnitude = _mm_andnot_ps(sign_bit, a);
    const __m128 use_rounded = _mm_cmplt_ps(magnitude, integral_from);
    return _mm_or_ps(_mm_and_ps(use_rounded, rounded),
                     _mm_andnot_ps(use_rounded, a));
}


// Per-lane remainder of a divided by aDiv, computed as
//     a - trunc(a / aDiv) * aDiv
// with the quotient truncated toward zero through an int32 round trip.
//
// Declared `static inline` like the other helpers in this file: in C99/C11
// a plain `inline` definition provides no external definition, so any call
// the compiler chooses not to inline would fail to link.
//
// NOTE: the quotient a / aDiv must fit in a signed 32-bit integer; the
// truncating conversion does not produce a meaningful value for quotients
// outside that range or for NaN. The result is also subject to the rounding
// of the division and multiplication, so it is not an exact fmodf().
static inline __m128 ModSee(const __m128 a, const __m128 aDiv) {
    const __m128 quotient = _mm_div_ps(a, aDiv);

    // Drop the fractional part of the quotient (toward zero).
    const __m128 whole_quotient = _mm_cvtepi32_ps(_mm_cvttps_epi32(quotient));

    // Remove the whole multiples of aDiv from a.
    return _mm_sub_ps(a, _mm_mul_ps(whole_quotient, aDiv));
}

关于c - 高效的 SSE FP `floor()`/`ceil()`/`round()` 没有 SSE4.1 的舍入函数?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54022478/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com