c++ - SSE1,2,3 round() 不完全遵循 std::round() 结果-6ren

c++ - SSE1,2,3 round() 不完全遵循 std::round() 结果

转载作者：行者123 更新时间：2023-12-05 04:31:42

28

4

我正在尝试使用 SSE(1,2,3) 指令创建类似于 std::round() 的函数，但某些值和/或逻辑运算符存在一些问题。这是我的代码:

#include <iostream>
#include <cmath>
#include <emmintrin.h>

/// Scalar reference rounding: nearest integer, halves away from zero.
///
/// A sign-matched 0.5 is added to x and the sum is then truncated toward zero
/// by the float-to-int conversion. The cast is applied to the whole biased
/// value; the earlier spelling cast only the comparison `(x > 0.0f)` and
/// produced the right answer solely through the implicit conversion at
/// `return`.
///
/// @param x value to round; the biased sum must be representable as an int.
/// @return x rounded to the nearest integer.
int round_int( float x ) {
    const float biased = (x > 0.0f) ? (x + 0.5f) : (x - 0.5f);
    return static_cast<int>(biased);
}

// First attempt from the question. Per lane: select +0.5f when x > 0 and
// -0.5f otherwise (branchless mask select), add it to x, convert to int32 and
// convert back to float.
//
// The float->int step uses _mm_cvtps_epi32, which rounds using the current
// MXCSR rounding mode rather than truncating, so the value is effectively
// rounded a second time after the bias was added (e.g. 1.2f + 0.5f = 1.7f
// becomes 2). This is the behaviour the question is asking about.
__m128 roundf_sse(__m128 x){
    const __m128 is_positive = _mm_cmpgt_ps(x, _mm_set1_ps(0.0f));
    const __m128 plus_half   = _mm_and_ps(_mm_set1_ps(0.5f), is_positive);
    const __m128 minus_half  = _mm_andnot_ps(is_positive, _mm_set1_ps(-0.5f));
    const __m128 biased      = _mm_add_ps(x, _mm_or_ps(plus_half, minus_half));
    const __m128i as_int     = _mm_cvtps_epi32(biased);
    return _mm_cvtepi32_ps(as_int);
}
 
// Converts every lane to int32 with _mm_cvtps_epi32 and straight back to
// float. No bias is applied, so the result is whatever the conversion's
// rounding (current MXCSR mode) produces; with the default mode, ties such as
// 0.5f and 2.5f go to the even neighbour (0 and 2).
__m128 roundf_mp(__m128 x){
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(x));
}
 
// Test driver: walks x from -10.0 to 10.9 in steps of 0.1 and prints, for each
// value, the result of std::round, round_int, roundf_sse and roundf_mp so the
// four can be compared side by side. Only lane 0 of each SSE result is shown
// (all four lanes hold the same input).
int main() {
    for (int i = -10; i <= 10; i++){
        for (int j = 0; j < 10; j++){
            const float x = static_cast<float>(i) + (static_cast<float>(j)/10.0f);

            std::cout << "x = " << x << "   ------------------------ " << '\n';
            std::cout << "std::round = " << std::round(x) << '\n';
            std::cout << "round_int  = " << round_int(x) << '\n';

            // _mm_store_ps requires a 16-byte aligned destination; a plain
            // float[4] on the stack carries no such guarantee.
            alignas(16) float m128res[4] = { 0 };
            const __m128 in = _mm_set1_ps(x);

            _mm_store_ps(m128res, roundf_sse(in));
            std::cout << "roundf_sse = " << m128res[0] << '\n';

            _mm_store_ps(m128res, roundf_mp(in));
            std::cout << "roundf_mp  = " << m128res[0] << '\n';
        }
    }
}

使用 Compiler Explorer 完成的一些测试 - https://godbolt.org/z/b5b5YqEKo

问题是:

a) roundf_mp() 函数:对于 ±6.5、±4.5、±2.5、±0.5 这样的输入值，会得到错误的结果

和

b) roundf_sse() 函数:它尝试沿用 round_int 函数的结构(round_int 的结果与 std::round() 的输出一致)，并且部分参考了另一篇帖子中介绍的无分支 “select” (cond ? a : b) 写法。

对于 a 情况下出现问题的原因，有什么建议吗？b 情况下是否有哪里实现得不正确？

编辑:改用 _mm_cvttps_epi32(截断转换)把 float 转换为 int 之后，我得到了正确的舍入结果:

// Asker's edited version: identical bias selection (+0.5f for x > 0, -0.5f
// otherwise), but the float->int step now uses the truncating conversion
// _mm_cvttps_epi32, so the biased value is chopped toward zero instead of
// being rounded a second time.
// The snippet as posted was missing its closing brace; it is restored here.
__m128 roundf_sse(__m128 x){ 
    __m128  zero    = _mm_set1_ps(0.0f);
    __m128  a       = _mm_set1_ps(0.5f);
    __m128  b       = _mm_set1_ps(-0.5f);
    __m128  cond    = _mm_cmpgt_ps(x, zero);                                  // all-ones mask where x > 0
    __m128  val     = _mm_or_ps(_mm_and_ps(a, cond),_mm_andnot_ps(cond, b));  // cond ? a : b

    return  _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val)));
}

这样做会不会有隐患(例如某些特定的值、取值范围的限制等)？

最佳答案

在评论者的帮助下，我解决了问题，并且通过更改实现方式还提高了这个函数的性能。有一个注意事项:此函数仅适用于 |x| < 2^23 的输入。对于超出此范围的值，可以在浮点到整数的转换中改用 _mm_cvtps_epi32() 来扩展适用范围。

我的原始实现(只是更换了 float 到整数转换所用的内在函数 intrinsic):

// Rounds each lane to the nearest integer, halves away from zero, by adding a
// sign-matched bias and truncating through int32. Valid for |x| < 2^23.
//
// The bias is the largest float below 0.5 (0.5 - 2^-25) rather than 0.5f
// itself: 0.49999997f + 0.5f is not representable and rounds up to 1.0f, so a
// 0.5f bias would return 1 where std::round returns 0. With the smaller bias,
// exact halves still reach the next integer because that addition rounds up.
__m128 roundf_sse(__m128 x){                // for |x|<2^23
    const __m128 zero = _mm_set1_ps(0.0f);
    const __m128 a    = _mm_set1_ps(0x1.fffffep-2f);    // +(0.5 - 2^-25)
    const __m128 b    = _mm_set1_ps(-0x1.fffffep-2f);   // -(0.5 - 2^-25)
    const __m128 cond = _mm_cmpgt_ps(x, zero);          // all-ones mask where x > 0
    const __m128 val  = _mm_or_ps(_mm_and_ps(a, cond), _mm_andnot_ps(cond, b)); // cond ? a : b

    // Truncating conversion: the bias has already moved the value past the
    // integer it should land on.
    return _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val)));
}

它的舍入结果与 std::round() 相同，但比 std::round() 慢约 60%(以 rdtsc/val 衡量，GCC -O3 -ffast-math)。

chtz 建议的实现给出与 std::round() 相同的舍入结果，性能也几乎相同(rdtsc/val)，并且与我的方法相比代码更少:

// Compact variant: builds the bias by copying the sign bit of x onto a
// positive constant, adds it, and truncates through int32.
// Rounds to nearest with halves away from zero; valid for |x| < 2^23.
//
// The constant is the largest float below 0.5 (0.5 - 2^-25), not 0.5f:
// 0.49999997f + 0.5f rounds up to 1.0f in float arithmetic, which would yield
// 1 where std::round yields 0. Exact halves are unaffected because adding the
// smaller bias to them still rounds up to the next integer.
__m128 roundf_sse(__m128 x){                // for |x|<2^23
    const __m128 sign = _mm_and_ps(_mm_set1_ps(-0.0f), x);               // sign bit of each lane
    const __m128 bias = _mm_or_ps(sign, _mm_set1_ps(0x1.fffffep-2f));    // copysign(0.5 - 2^-25, x)
    return _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, bias)));
}

编辑:

这个实现(实际上是它的三个版本)适用于整个范围:

// "Magic number" rounding: adds 12582912.0f (= 1.5 * 2^23) to every lane and
// subtracts it again. While the intermediate sum lies in [2^23, 2^24) floats
// are spaced 1 apart, so the addition itself rounds x to an integer using the
// current rounding mode (ties to even by default, e.g. 2.5f -> 2.0f).
// NOTE(review): the original comment labelled this "for |x| >= 2^23"; for
// positive inputs that large the sum exceeds 2^24, where floats are spaced 2
// apart, so odd values would not survive the round trip - confirm the
// intended input range before relying on it.
__m128 round_M(__m128 x){
    const __m128 magic = _mm_set1_ps(12582912.0f);
    return _mm_sub_ps(_mm_add_ps(x, magic), magic);
}

// Full-range round-half-away-from-zero for four packed floats.
//
// Lanes with |x| < 2^23 are rounded by adding a sign-matched bias and
// truncating through int32. Every other lane is returned unchanged: a float
// with |x| >= 2^23 has no fractional bits left, and NaN fails the comparison
// so it is passed through as well.
//
// Differences from the earlier form of this function:
//  * the bias is the largest float below 0.5 (0.5 - 2^-25) instead of 0.5f,
//    because 0.49999997f + 0.5f rounds up to 1.0f and gave 1 instead of 0;
//  * the select mask is "|x| < 2^23" rather than "|x| >= 2^23", so NaN lanes
//    keep their value instead of taking the int32 conversion result;
//  * the sign bit of x is OR-ed back in, so inputs in (-0.5, -0] yield -0.0f.
//
// The author's two alternatives for the large-|x| lanes (converting with
// _mm_cvtps_epi32, or calling round_M) are not needed once those lanes are
// simply passed through, which is the third variant the author settled on.
// The author measured the earlier form at about 2.5x slower than std::round()
// (rdtsc/val); this form has not been re-measured.
__m128 roundf_sse(__m128 x){
    const __m128 SIGNMASK = _mm_set1_ps(-0.0f);
    const __m128 lim      = _mm_set1_ps(0x1.0p23f);
    const __m128 sign     = _mm_and_ps(SIGNMASK, x);        // sign bit of each lane
    const __m128 mag      = _mm_andnot_ps(SIGNMASK, x);     // |x|
    const __m128 small    = _mm_cmplt_ps(mag, lim);         // false for NaN and |x| >= 2^23

    // copysign(0.5 - 2^-25, x)
    const __m128 bias = _mm_or_ps(sign, _mm_set1_ps(0x1.fffffep-2f));

    // Truncate the biased value; the result computed for large/NaN lanes is
    // discarded by the select below.
    __m128 rounded = _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, bias)));
    rounded = _mm_or_ps(rounded, sign);                     // keep -0.0f for small negatives

    // small ? rounded : x
    return _mm_or_ps(_mm_and_ps(small, rounded), _mm_andnot_ps(small, x));
}

但是，以 rdtsc/val 衡量，它比 std::round() 慢得多。

关于c++ - SSE1,2,3 round() 不完全遵循 std::round() 结果，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/71801608/

28

4

0

文章推荐： Ruby 3.0.0 似乎不使用 jemalloc

文章推荐： reactjs - 防止重新渲染 flatlist React Native

文章推荐： r - 如何对 data.frames 列表进行子集化？

c++ - 为什么 `std::common_type_t` 等于 `std::ostream` 而不是 `std::ostream &` ？
我正在开发一个小型图书馆，我需要做的一件事是让访问者访问一些数据并返回结果。在一些较旧的 C++ 代码中，访问者需要声明一个 typedef return_type .例如，boost::stati
c++ - std::map 麻烦
我正在尝试使用std:map类型的键和值制作std::any Visual Studio 2017 std::map m("lastname", "Ivanov"); std::cout (m["la
C++ std::map> 。如何循环设定值？
我已经在 C++ 的 map 中声明了一个集合为 std::map> .如何循环访问或打印设定值？最佳答案如果你知道如何迭代 std::map或 std::set单独地，您应该可以毫无问题地组合迭
C++ 循环 std::vector>
如何循环？我已经试过了: //----- code std::vector >::iterator it; for ( it = users.begin(); it != users.end();
c++ - std::unique_lock 还是 std::lock_guard？
我有两个用例。 A.我想同步访问两个线程的队列。 B.我想同步两个线程对队列的访问并使用条件变量，因为其中一个线程将等待另一个线程将内容存储到队列中。对于用例 A，我看到了使用 std::lock_
c++ - std::trivially_copyable_v 和 std::is_pod_v 之间有什么区别(std::is_standard_layout && std::is_trivial_v)
我正在查看这两种类型特征的文档，但不确定有什么区别。我不是语言律师，但据我所知，它们都适用于“memcpy-able”类型。它们可以互换使用吗？最佳答案不，这些术语不能互换使用。这两个术语都表示
c++ - 为什么我可以有一个 std::vector 而不是 std::vector？
我有以下测试代码，其中有一个参数 fS，它是 ofstream 的容器: #include #include #include #include int
c++ - std::unordered_map
这是这个问题的延续 c++ function ptr in unorderer_map, compile time error 我试图使用 std::function 而不是函数指针，并且只有当函数是

c++ - 将 std::any_of、std::all_of、std::none_of 等与 std::map 一起使用
std::unordered_map str_bool_map = { {"a", true}, {"b", false}, {"c", true} }; 我们可以在此映射上使
c++ - 使用 std::find 检查 std::vector> 中的项目
我有以下对象 std::vector> vectorList; 然后我添加到这个使用 std::vector vec_tmp; vec_tmp.push_back(strDRG); vec_tmp.p
c++ - 为什么 std::initializer_list 不支持 std::get<>、std::tuple_size 和 std::tuple_element
为什么 std::initializer_list不支持std::get<> , std::tuple_size和 std::tuple_element ？在constexpr中用得很多现在的表达式，
c++ - std::tuple 和 std::tuple 是否被 std::vector 视为同一类型？
我有一个像这样定义的变量 auto drum = std::make_tuple ( std::make_tuple ( 0.3f , Ex
c++ :将 std::map 转换为 std::map
假设我有一个私有(private)std::map在我的类(class)里std::map 。我怎样才能将其转换为std::map返回给用户？我想要下面的原型(prototype) const std
c++ :将 std::map 转换为 std::map
假设我有一个私有(private)std::map在我的类(class)里std::map 。我怎样才能将其转换为std::map返回给用户？我想要下面的原型(prototype) const std
c++ - 在带有 std::ref 的 std::thread 中使用地址清理调用 std::invoke(std::forward(...)) 时的奇怪行为
问题我正在尝试将 lambda 闭包传递给 std::thread，它使用任意封闭参数调用任意封闭函数。 template std::thread timed_thread(Function&& f
c++ - 具有模板模板参数的模板定义，可以专门化为类，例如，std::vector 或 std::map
我想创建一个模板类，可以容纳容器和容器的任意组合。例如，std::vector或 std::map ，例如。我尝试了很多组合，但我必须承认模板的复杂性让我不知所措。我编译的关闭是这样的: templ
c++ - 将 std::vector> 分配给另一个 std::vector>
我有一个 std::vector>我将其分配给相同类型的第二个 vector 。我收到这个编译器错误: /opt/gcc-8.2.0/include/c++/8.2.0/bits/stl_algob
c++ - 将 std::vector> 移动到 std::vector>
有时候，我们有一个工厂可以生成一个 std::unique_ptr vector ，后来我们想在类/线程/你命名的之间共享这些指针。因此，最好改用 std::shared_ptr 。当然有一种方法可以
c++ - 为什么 std::sort 假定 std::vector< std::vector> 默认为 std::vector，从而产生错误的结果？
这个问题在这里已经有了答案: Sorting a vector of custom objects (14 个答案) 关闭 6 年前。我创建了一个 vector vector ，我想根据我定义的参
c++ - 将 std::vector> 转换为 std::vector>
我有三个类(class)成员: public: std::vector > getObjects(); std::vector > getObjects() const; privat

首页

博学

6Ren·AI

商城

c++ - SSE1,2,3 round() 不完全遵循 std::round() 结果