c++ - shufps 比内存访问慢吗？-6ren

c++ - shufps 比内存访问慢吗？

转载作者：可可西里更新时间：2023-11-01 18:37:32

标题看似无稽之谈，但让我解释一下。前几天在研究一个程序的时候遇到了下面的汇编代码:

movaps  xmm3, xmmword ptr [rbp-30h]
lea     rdx, [rdi+1320h]
movaps  xmm5, xmm3
movaps  xmm6, xmm3
movaps  xmm0, xmm3
movss   dword ptr [rdx], xmm3
shufps  xmm5, xmm3, 55h
shufps  xmm6, xmm3, 0AAh
shufps  xmm0, xmm3, 0FFh
movaps  xmm4, xmm3
movss   dword ptr [rdx+4], xmm5
movss   dword ptr [rdx+8], xmm6
movss   dword ptr [rdx+0Ch], xmm0
mulss   xmm4, xmm3

而且它似乎主要是将四个 float 从 [rbp-30h] 复制到 [rdx]。这些 shufps 仅用于选择 xmm3 中的四个 float 之一(例如 shufps xmm5, xmm3, 55h 选择第二个 float 并将其放置在 xmm5 中)。

这让我想知道编译器是否这样做是因为 shufps 实际上比内存访问更快(类似于 movss xmm0, dword ptr [rbp-30h], movss dword ptr [rdx], xmm0).

所以我写了一些测试来比较这两种方法，发现 shufps 总是比多次内存访问慢。现在我想，也许使用 shufps 与性能无关。它可能只是用来混淆代码，使反编译器无法轻易生成干净的代码(我用 IDA Pro 尝试过，反编译结果确实过于复杂)。

虽然我可能永远不会在任何实际程序中显式地使用 shufps(例如通过使用 _mm_shuffle_ps)，因为编译器很可能比我更聪明，但我仍然想知道为什么编译程序的编译器会生成这样的代码。它既不快也不小。这没有意义。

无论如何，我将提供我在下面编写的测试。

#include <Windows.h>
#include <iostream>

using namespace std;

/// Calls `routine(arg)` `iterations` times and reports how long the whole loop
/// took, using GetTickCount() (millisecond tick counter) sampled before and
/// after the loop.
///
/// @param routine     Callback invoked once per iteration.
/// @param arg         Opaque pointer forwarded unchanged to every call.
/// @param iterations  Number of calls to make; zero or a negative value makes
///                    no calls at all.
/// @return Tick count after the loop minus tick count before it.
__declspec(noinline) DWORD profile_routine(void (*routine)(void *), void *arg, int iterations = 1)
{
    const DWORD startTime = GetTickCount();
    // Test "> 0" rather than plain truthiness: a negative count used to be
    // treated as true and kept decrementing until signed overflow.
    for (int remaining = iterations; remaining > 0; --remaining)
    {
        routine(arg);
    }
    // DWORD arithmetic is unsigned, so the difference is still meaningful if
    // the tick counter wraps between the two samples.
    return GetTickCount() - startTime;
}


// Plain aggregate of four floats; the shuffle routines return one of these
// by value. Member order (x, y, z, w) is what brace-initialization relies on.
struct Struct
{
    float x;
    float y;
    float z;
    float w;
};

/// Scalar reference version: reads the four floats of `arr` one at a time and
/// returns them permuted as {arr[3], arr[2], arr[0], arr[1]}.
///
/// @param arr  Pointer to at least four readable floats.
/// @return Struct with x = arr[3], y = arr[2], z = arr[0], w = arr[1].
__declspec(noinline) Struct shuffle1(float *arr)
{
    Struct permuted;
    permuted.x = arr[3];
    permuted.y = arr[2];
    permuted.z = arr[0];
    permuted.w = arr[1];
    return permuted;
}


// Immediate operands for _mm_shuffle_ps (shufps). Each value repeats a single
// 2-bit lane index in all four selector fields (e.g. 0x55 == 0b01010101), so
// when both source operands are the same vector, SSn copies lane n of that
// vector into every lane of the result.
#define SS0     (0x00)
#define SS1     (0x55)
#define SS2     (0xAA)
#define SS3     (0xFF)
/// SSE version of shuffle1: loads all four floats with one aligned 16-byte
/// load, then uses a shufps per element to bring the wanted lane down to
/// lane 0 before storing it as a scalar.
///
/// @param arr  Pointer to four floats. The load is an aligned one, so `arr`
///             must be 16-byte aligned.
/// @return Struct with x = arr[3], y = arr[2], z = arr[0], w = arr[1].
__declspec(noinline) Struct shuffle2(float *arr)
{
    const __m128 lanes = _mm_load_ps(arr);

    Struct out;
    // _mm_store_ss writes only lane 0, hence the broadcast-style shuffles.
    _mm_store_ss(&out.x, _mm_shuffle_ps(lanes, lanes, SS3));
    _mm_store_ss(&out.y, _mm_shuffle_ps(lanes, lanes, SS2));
    _mm_store_ss(&out.z, _mm_shuffle_ps(lanes, lanes, SS0));
    _mm_store_ss(&out.w, _mm_shuffle_ps(lanes, lanes, SS1));
    return out;
}



void profile_shuffle_r1(void *arg)
{
    float *arr = static_cast<float *>(arg);
    Struct q = shuffle1(arr);
    arr[0] += q.w;
    arr[1] += q.z;
    arr[2] += q.y;
    arr[3] += q.x;
}
void profile_shuffle_r2(void *arg)
{
    float *arr = static_cast<float *>(arg);
    Struct q = shuffle2(arr);
    arr[0] += q.w;
    arr[1] += q.z;
    arr[2] += q.y;
    arr[3] += q.x;
}

int main(int argc, char **argv)
{
    int n = argc + 3;
    float arr1[4], arr2[4];
    for (int i = 0; i < 4; i++)
    {
        arr1[i] = static_cast<float>(n + i);
        arr2[i] = static_cast<float>(n + i);
    }

    int iterations = 20000000;
    DWORD time1 = profile_routine(profile_shuffle_r1, arr1, iterations);
    cout << "time1 = " << time1 << endl;
    DWORD time2 = profile_routine(profile_shuffle_r2, arr2, iterations);
    cout << "time2 = " << time2 << endl;

    return 0;
}

在上面的测试中，我有两个洗牌方法 shuffle1 和 shuffle2 做同样的事情。当使用 MSVC -O2 编译时，它会生成以下代码:

shuffle1:
 mov         eax,dword ptr [rdx+0Ch]  
 mov         dword ptr [rcx],eax  
 mov         eax,dword ptr [rdx+8]  
 mov         dword ptr [rcx+4],eax  
 mov         eax,dword ptr [rdx]  
 mov         dword ptr [rcx+8],eax  
 mov         eax,dword ptr [rdx+4]  
 mov         dword ptr [rcx+0Ch],eax  
 mov         rax,rcx  
 ret  
shuffle2:
 movaps      xmm2,xmmword ptr [rdx]  
 mov         rax,rcx  
 movaps      xmm0,xmm2  
 shufps      xmm0,xmm2,0FFh  
 movss       dword ptr [rcx],xmm0  
 movaps      xmm0,xmm2  
 shufps      xmm0,xmm2,0AAh  
 movss       dword ptr [rcx+4],xmm0  
 movss       dword ptr [rcx+8],xmm2  
 shufps      xmm2,xmm2,55h  
 movss       dword ptr [rcx+0Ch],xmm2  
 ret

shuffle1 在我的机器上总是比 shuffle2 至少快 30%。我确实注意到 shuffle2 多了两条指令，而且 shuffle1 实际上使用的是 eax 而不是 xmm0，所以我想，如果加入一些无意义的算术运算，结果可能会有所不同。

所以我将它们修改如下:

/// Scalar reference version with extra arithmetic: reads the four floats of
/// `arr` individually and returns four three-term sums of them.
///
/// @param arr  Pointer to at least four readable floats.
/// @return Struct with
///         x = arr[3] + arr[2] + arr[0],
///         y = arr[2] + arr[0] + arr[1],
///         z = arr[0] + arr[1] + arr[3],
///         w = arr[1] + arr[3] + arr[2],
///         each sum evaluated left to right.
__declspec(noinline) Struct shuffle1(float *arr)
{
    const float a3 = arr[3];
    const float a2 = arr[2];
    const float a0 = arr[0];
    const float a1 = arr[1];

    Struct sums;
    // Parentheses spell out the left-to-right association of the float adds.
    sums.x = (a3 + a2) + a0;
    sums.y = (a2 + a0) + a1;
    sums.z = (a0 + a1) + a3;
    sums.w = (a1 + a3) + a2;
    return sums;
}


// Immediate operands for _mm_shuffle_ps (shufps). Each value repeats a single
// 2-bit lane index in all four selector fields (e.g. 0x55 == 0b01010101), so
// when both source operands are the same vector, SSn copies lane n of that
// vector into every lane of the result.
#define SS0     (0x00)
#define SS1     (0x55)
#define SS2     (0xAA)
#define SS3     (0xFF)
/// SSE version with extra arithmetic: loads all four floats with one aligned
/// 16-byte load, isolates each element in lane 0 with shufps, then builds the
/// four results from two shared partial sums using scalar (lane 0) adds.
///
/// @param arr  Pointer to four floats. The load is an aligned one, so `arr`
///             must be 16-byte aligned.
/// @return Struct with
///         x = arr[3] + (arr[2] + arr[0]),
///         y = arr[1] + (arr[2] + arr[0]),
///         z = arr[0] + (arr[1] + arr[3]),
///         w = arr[2] + (arr[1] + arr[3]).
__declspec(noinline) Struct shuffle2(float *arr)
{
    const __m128 lanes = _mm_load_ps(arr);

    const __m128 lane3 = _mm_shuffle_ps(lanes, lanes, SS3);
    const __m128 lane2 = _mm_shuffle_ps(lanes, lanes, SS2);
    const __m128 lane0 = _mm_shuffle_ps(lanes, lanes, SS0);
    const __m128 lane1 = _mm_shuffle_ps(lanes, lanes, SS1);

    // Two partial sums, each reused by two of the outputs.
    const __m128 sum20 = _mm_add_ss(lane2, lane0);
    const __m128 sum13 = _mm_add_ss(lane1, lane3);

    Struct out;
    // Only lane 0 of each sum is stored.
    _mm_store_ss(&out.x, _mm_add_ss(lane3, sum20));
    _mm_store_ss(&out.y, _mm_add_ss(lane1, sum20));
    _mm_store_ss(&out.z, _mm_add_ss(lane0, sum13));
    _mm_store_ss(&out.w, _mm_add_ss(lane2, sum13));
    return out;
}

现在汇编代码看起来更公平了，因为它们具有相同数量的指令并且都需要使用 xmm 寄存器。

shuffle1:
 movss       xmm5,dword ptr [rdx+8]  
 mov         rax,rcx  
 movss       xmm3,dword ptr [rdx+0Ch]  
 movaps      xmm0,xmm5  
 movss       xmm2,dword ptr [rdx]  
 addss       xmm0,xmm3  
 movss       xmm4,dword ptr [rdx+4]  
 movaps      xmm1,xmm2  
 addss       xmm1,xmm5  
 addss       xmm0,xmm2  
 addss       xmm1,xmm4  
 movss       dword ptr [rcx],xmm0  
 movaps      xmm0,xmm4  
 addss       xmm0,xmm2  
 addss       xmm4,xmm3  
 movss       dword ptr [rcx+4],xmm1  
 addss       xmm0,xmm3  
 addss       xmm4,xmm5  
 movss       dword ptr [rcx+8],xmm0  
 movss       dword ptr [rcx+0Ch],xmm4  
 ret  
shuffle2:
 movaps      xmm4,xmmword ptr [rdx]  
 mov         rax,rcx  
 movaps      xmm3,xmm4  
 movaps      xmm5,xmm4  
 shufps      xmm5,xmm4,0AAh  
 movaps      xmm2,xmm4  
 shufps      xmm2,xmm4,0FFh  
 movaps      xmm0,xmm5  
 addss       xmm0,xmm3  
 shufps      xmm4,xmm4,55h  
 movaps      xmm1,xmm4  
 addss       xmm1,xmm2  
 addss       xmm2,xmm0  
 addss       xmm4,xmm0  
 addss       xmm3,xmm1  
 addss       xmm5,xmm1  
 movss       dword ptr [rcx],xmm2  
 movss       dword ptr [rcx+4],xmm4  
 movss       dword ptr [rcx+8],xmm3  
 movss       dword ptr [rcx+0Ch],xmm5  
 ret

但这并没有改变结果。shuffle1 仍然快 30%!

最佳答案

如果没有更广泛的上下文，很难确定，但是......在针对较新的处理器进行优化时，您必须考虑不同执行端口的使用情况。请参阅 Agner Fog 的指令表:http://www.agner.org/optimize/instruction_tables.pdf

在这种情况下，虽然这看起来不太可能，但如果我们假设这段汇编代码确实经过了优化，我能想到几种可能性。

1. 这可能出现在这样一段代码中：乱序调度器恰好有更多的端口 5(例如，在 Haswell 上)可用，而端口 2 和 3(同样以 Haswell 为例)则比较紧张。
2. 与 #1 类似，但在超线程时可能会观察到相同的效果。此代码可能旨在不抢占兄弟超线程的读取操作。
3. 最后一种可能与这类优化本身有关，我自己也用过类似的手法。假设有一个分支，在运行时几乎 100% 可预测，但在编译时无法确定；再假设紧跟在该分支之后有一次经常缓存未命中的读取，而你希望这次读取尽早发出。如果你不占用读取端口，乱序调度器就会提前开始执行该读取。这样 shufps 指令基本上就可以“免费”执行。示例如下:
```
  MOV ecx, [some computed, mostly constant at run-time global]
 label loop:
  ADD rdi, 16
  ADD rbp, 16
  CALL shuffle
  SUB ecx, 1
  JNE loop

MOV rax, [rdi]

;do a read that could be "predicted" properly
MOV rbx, [rax]
```

不过，老实说，它看起来更像是编写得不好的汇编，或是生成得不好的机器代码，所以我不会在这上面花太多心思。我给出的例子出现的可能性非常低。

关于c++ - shufps 比内存访问慢吗？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/42174929/

文章推荐： c++ - 为什么似乎从来没有人写过 `delete &someObj` ？

文章推荐： c++ - 将范围拆分为重叠范围的范围

javascript - php 访问 mqsql 或 html 访问 json 或 html 访问 xml ？哪个更快？
关闭。这个问题是opinion-based 。目前不接受答案。想要改进这个问题吗？更新问题，以便 editing this post 可以用事实和引文来回答它。 . 已关闭 4 年前。 Improv
powershell - API 访问 PowerShell Web 访问？
PowerShell Web Access 允许您通过 Web 浏览器运行 PowerShell cmdlet。它显示了一个基于 Web 的控制台窗口。有没有办法运行 cmdlet 而无需在控制台窗
c# - 如何使用应用程序级身份验证/访问 token 访问 Sharepoint 文件？
我尝试在无需用户登录的情况下访问 Sharepoint 文件。我可以通过以下任一方式获取访问 token 方法一: var client = new RestClient("https://logi
soap - 使用 OAuth 访问 token 访问 SOAP 服务？
我目前正在尝试通过 Chrome 扩展程序访问 Google 服务。我的理解是，对于 JS 应用程序，Google 首选的身份验证机制是 OAuth。我的应用目前已成功通过 OAuth 向服务进行身份
C++ - 允许通过基类(接口(interface))访问，禁止通过派生类(具体实现)访问？
假设我有纯抽象类 IHandler 和派生自它的类: class IHandler { public: virtual int process_input(char input) = 0; };
css - 可以通过 URL 访问 CSS 文件，但不能从 HTML 访问
我有一个带有 ThymeLeaf 和 Dojo 的 Spring 应用程序，这给我带来了问题。当我从我的 HTML 文件中引用 CSS 文件时，它们在 Firebug 中显示为中止。但是，当我通过在地
javascript - 为什么我可以用 [val] 访问 js 对象，但不能用 .val 访问？
这个问题已经有答案了: JavaScript property access: dot notation vs. brackets? (17 个回答) 已关闭 6 年前。为什么这不起作用？ func
.htaccess - 仅允许通过 http 访问 robot.txt，其他通过 https 访问
我想将所有流量重定向到 https，只有 robot.txt 应该可以通过 http 访问。是否可以为 robot.txt 文件创建异常(exception)？我的 .htaccess 文件: R
oauth-2.0 - 无法使用有效的 oauth2 访问 token 访问 Linkedin 个人资料
我遇到了 LinkedIn OAuth2: "Unable to verify access token" 中描述的相同问题;但是，那里描述的解决方案并不能解决我的问题。我能够成功请求访问 toke
Docker 容器不能通过 localhost 访问，但可以通过 127.0.0.1 访问
问题我有一个暴露给 *:8080 的 Docker 服务容器. 我无法通过 localhost:8080 访问容器. Chrome /curl无限期挂断。但是如果我使用任何其他本地IP，我就可以访
python - 使用 OAuth 2.0 访问 token 访问 Gmail Imap
我正在使用 Google 的 Oauth 2.0 来获取用户的 access_token，但我不知道如何将它与 imaplib 一起使用来访问收件箱。最佳答案下面是带有 oauth 2.0 的 I
curl - 可以从 curl 访问 docker 服务，但不能从 postman/chrome 访问
我正在做 docker 入门指南:https://docs.docker.com/get-started/part3/#recap-and-cheat-sheet-optional docker-co
azure - 带有 Nginx 的 AKS 无法通过 IP 访问，只能通过 DNS 访问
我正在尝试使用静态 IP 在 AKS 上创建一个 Web 应用程序，自然找到了一个带有 Nginx ingress controller in Azure's documentation 的解决方案。
javascript - 为什么可以将 'module.exports' 作为 'exports' 访问，但不能使用 'module.id' 访问？
这是我在名为 foo.js 的文件中的代码。 console.log('module.exports:', module.exports) console.log('module.id:', modu
amazon-web-services - aws 访问 key ID 和 secret 访问 key
我试图理解访问键。我读过https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-se
xcode - 从 iOS 5 访问 Twitter 时 OAuth 访问 token 失败
我正在使用 MGTwitterEngine"将 twitter 集成到我的应用程序中。它在 iOS 4.2 上运行良好。当我尝试从任何 iOS 5 设备访问 twitter 时，我遇到了身份验证 to
amazon-web-services - aws 访问 key ID 和 secret 访问 key
我试图理解访问键。我读过https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-se
ios - 如果 Facebook 访问 token 过期，会生成新的 Facebook 访问 token 吗？
我正在使用以下 API 列出我的 Facebook 好友。 https://graph.facebook.com/me/friends?access_token= ??? 我想知道访问 token 过
google-app-engine - 尝试使用 API key 访问 BigQuery 时出错(简单 API 访问)
401 Unauthorized - Show headers - { "error": { "errors": [ { "domain": "global", "reas
django - 从 heroku 访问 s3 内容时，AWS 访问 key 显示在浏览器 url 中
我已经将我的 django 应用程序部署到 heroku 并使用 Amazon s3 存储桶存储静态文件，我发现从 s3 存储桶到 heroku 获取数据没有问题。但是，当我测试查看内容存储位置时，除

可可西里

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c++ - shufps 比内存访问慢吗？