performance - 为什么在 AMD Zen 上整数除法吞吐量与较大值的差异很小？-6ren

performance - 为什么在 AMD Zen 上整数除法吞吐量与较大值的差异很小？

转载作者：行者123 更新时间：2023-12-05 01:52:52

我很好奇，在操作数(寄存器)位宽不变的情况下，对有效位数不同的两个值做除法时性能会有何差异。我预计性能取决于被除数的最高置位，因为我假设 CPU 内部迭代的减法-移位“算法”可能在一个时钟周期内完成多次迭代。

所以我写了一个 C++20 小程序来测试这个:

#include <iostream>
#include <iostream>
#include <type_traits>
#include <cstdint>
#include <random>
#include <limits>
#include <atomic>
#include <chrono>
#include <vector>
#include <sstream>

using namespace std;
using namespace chrono;

// Measures how long unsigned integer division takes for different
// combinations of value width (significant bits in the data) and operand
// width (type the division is carried out in) and prints ns per division.
// Every line after the first also shows the time as a percentage of the
// 8-bit-value / 8-bit-operand reference run.
int main()
{
    constexpr size_t ROUNDS = 100'000'000;
    // ssp<TOp, TValue>(): fills two tables with random numbers that fit in
    // TValue but are stored as TOp, divides them ROUNDS times and returns the
    // average nanoseconds per division.
    auto ssp = []<typename TOp, typename TValue>() -> double
        requires is_unsigned_v<TOp> && is_unsigned_v<TValue> && (sizeof(TOp) >= sizeof(TValue))
    {
        constexpr size_t N = 0x1000;
        vector<TOp> vDividends( N ), vDivisors( N );
        mt19937_64 mt;
        // numeric_limits<>::digits is the bit width of an unsigned type; it
        // replaces sizeof * CHAR_BIT, which needs <climits> (not included).
        uniform_int_distribution<unsigned> uidBits( 0, numeric_limits<TValue>::digits - 1 );
        uniform_int_distribution<uint64_t> uidValue( 0, numeric_limits<TValue>::max() );
        // Random TValue whose most significant bit is set.
        auto getMaxBitsValue = [&]() -> TOp
        {
            for( TValue value; ; )
                if( static_cast<make_signed_t<TValue>>( value = static_cast<TValue>( uidValue( mt ) ) ) < 0 )
                    return value;
        };
        // Shifting a top-bit-set value right by at most (width - 1) bits gives
        // a random number of significant bits and never yields zero.
        auto getDividend = [&]() -> TOp
        {
            return getMaxBitsValue() >> uidBits( mt );
        };
        // Retry until the divisor is not larger than the dividend.
        auto getDivisor = [&]( TOp dividend ) -> TOp
        {
            for( TOp divisor; ; )
                if( (divisor = getMaxBitsValue() >> uidBits( mt )) <= dividend )
                    return divisor;
        };
        for( size_t i = 0; i != N; ++i )
            vDivisors[i] = getDivisor( (vDividends[i] = getDividend()) );
        // The accumulator was missing in the original listing (compile error).
        TOp sum = 0;
        atomic<TOp> aSum( 0 );
        auto start = high_resolution_clock::now();
        for( size_t r = ROUNDS; r--; )
            sum += vDividends[r % N] / vDivisors[r % N];
        double ns = (int64_t)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count() / (double)ROUNDS;
        // Store the accumulated quotients in an atomic so the divisions have
        // an observable use and are not dropped as dead code.
        aSum = sum;
        return ns;
    };
    // Formats a timing; appends the percentage of nsRef when nsRef > 0.
    auto results = []( double ns, double nsRef )
    {
        ostringstream oss;
        oss << ns;
        if( nsRef > 0.0 )
            oss << " (" << (int)(ns / nsRef * 100.0 + 0.5) << "%)";
        return oss.str();
    };
    double nsRef;
    cout << "8v, 8op: " << results( (nsRef = ssp.template operator ()<uint8_t, uint8_t>()), 0.0 ) << endl;
    cout << "8v, 16op: " << results( ssp.template operator ()<uint16_t, uint8_t>(), nsRef ) << endl;
    cout << "8v, 32op: " << results( ssp.template operator ()<uint32_t, uint8_t>(), nsRef ) << endl;
    cout << "8v, 64op: " << results( ssp.template operator ()<uint64_t, uint8_t>(), nsRef ) << endl;
    cout << "16v, 16op: " << results( ssp.template operator ()<uint16_t, uint16_t>(), nsRef ) << endl;
    cout << "16v, 32op: " << results( ssp.template operator ()<uint32_t, uint16_t>(), nsRef ) << endl;
    cout << "16v, 64op: " << results( ssp.template operator ()<uint64_t, uint16_t>(), nsRef ) << endl;
    cout << "32v, 32op: " << results( ssp.template operator ()<uint32_t, uint32_t>(), nsRef ) << endl;
    cout << "32v, 64op: " << results( ssp.template operator ()<uint64_t, uint32_t>(), nsRef ) << endl;
    cout << "64v, 64op: " << results( ssp.template operator ()<uint64_t, uint64_t>(), nsRef ) << endl;
}

对于值位数相同而操作数位宽不同的情况，结果相同，只有轻微的测量误差。但是，在我的计算机(Ryzen Threadripper 3990X)上，将一个 64 位值除以另一个 64 位值所花费的时间，仅比将一个 8 位值除以另一个 8 位值多约 50%，这在 CPU 架构上的原因是什么？即使在配备 Ryzen 7 1800X 的 Linux 计算机上，百分比关系也大致相同。

我将其发布在“x86”、“x86-64”和“assembly”标签下而不是 C++ 标签下，因为我问的不是 C++ 问题而是机器级问题。

[编辑]:这是带有一些序列化汇编代码 (MASM) 的 MSVC++ 代码的较新版本:

#include <iostream>
#include <iostream>
#include <type_traits>
#include <cstdint>
#include <random>
#include <limits>
#include <atomic>
#include <chrono>
#include <vector>
#include <sstream>

using namespace std;
using namespace chrono;

// Division loops defined outside this translation unit: the MASM listing that
// accompanies this program exports one symbol per overload.
// Each overload performs `rounds` unsigned divisions dividends[i] / divisors[i]
// at the element width of its pointer arguments, cycling over the first `n`
// elements of both arrays.
// NOTE(review): the assembly leaves the quotient of the last division in the
// accumulator register, which appears to be the returned value; it is not
// set when rounds == 0 or n == 0 - confirm before relying on it.
uint64_t divLoop( uint64_t *dividends, uint64_t *divisors, size_t n, size_t rounds );
uint32_t divLoop( uint32_t *dividends, uint32_t *divisors, size_t n, size_t rounds );
uint16_t divLoop( uint16_t *dividends, uint16_t *divisors, size_t n, size_t rounds );
uint8_t divLoop( uint8_t *dividends, uint8_t *divisors, size_t n, size_t rounds );

// Measures division timing per (value width, operand width) combination using
// the externally defined divLoop() overloads and prints ns per division.
// Every line after the first also shows the time as a percentage of the
// 8-bit-value / 8-bit-operand reference run.
int main()
{
    constexpr size_t ROUNDS = 100'000'000;
    // ssp( TOp(), TValue() ): the arguments only select the types. Fills two
    // tables with random numbers that fit in TValue but are stored as TOp,
    // runs divLoop() over them for ROUNDS divisions and returns the average
    // nanoseconds per division.
    auto ssp = []<typename TOp, typename TValue>( TOp const &, TValue const & ) -> double
        requires is_unsigned_v<TOp> && is_unsigned_v<TValue> && (sizeof(TOp) >= sizeof(TValue))
    {
        constexpr size_t N = 0x1000;
        vector<TOp> vDividends( N ), vDivisors( N );
        mt19937_64 mt;
        // numeric_limits<>::digits is the bit width of an unsigned type; it
        // replaces sizeof * CHAR_BIT, which needs <climits> (not included).
        uniform_int_distribution<unsigned> uidBits( 0, numeric_limits<TValue>::digits - 1 );
        uniform_int_distribution<uint64_t> uidValue( 0, numeric_limits<TValue>::max() );
        // Random TValue whose most significant bit is set.
        auto getMaxBitsValue = [&]() -> TOp
        {
            for( TValue value; ; )
                if( static_cast<make_signed_t<TValue>>( value = static_cast<TValue>( uidValue( mt ) ) ) < 0 )
                    return value;
        };
        // Shifting a top-bit-set value right by at most (width - 1) bits gives
        // a random number of significant bits and never yields zero.
        auto getDividend = [&]() -> TOp
        {
            return getMaxBitsValue() >> uidBits( mt );
        };
        // Retry until the divisor is not larger than the dividend.
        auto getDivisor = [&]( TOp dividend ) -> TOp
        {
            for( TOp divisor; ; )
                if( (divisor = getMaxBitsValue() >> uidBits( mt )) <= dividend )
                    return divisor;
        };
        for( size_t i = 0; i != N; ++i )
            vDivisors[i] = getDivisor( (vDividends[i] = getDividend()) );
        atomic<TOp> aSum( 0 );
        auto start = high_resolution_clock::now();
        TOp const loopResult = divLoop( vDividends.data(), vDivisors.data(), N, ROUNDS );
        double ns = (int64_t)duration_cast<nanoseconds>( high_resolution_clock::now() - start ).count() / (double)ROUNDS;
        // Keep the routine's result observable. The previous code stored a
        // local `sum` that was never written, so the sink always received 0.
        aSum = loopResult;
        return ns;
    };
    // Formats a timing; appends the percentage of nsRef when nsRef > 0.
    auto results = []( double ns, double nsRef )
    {
        ostringstream oss;
        oss << ns;
        if( nsRef > 0.0 )
            oss << " (" << (int)(ns / nsRef * 100.0 + 0.5) << "%)";
        return oss.str();
    };
    double nsRef;
    cout << "8v, 8op: " << results( (nsRef = ssp( uint8_t(), uint8_t() )), 0.0 ) << endl;
    cout << "8v, 16op: " << results( ssp( uint16_t(), uint8_t() ), nsRef ) << endl;
    cout << "8v, 32op: " << results( ssp( uint32_t(), uint8_t() ), nsRef ) << endl;
    cout << "8v, 64op: " << results( ssp( uint64_t(), uint8_t() ), nsRef ) << endl;
    cout << "16v, 16op: " << results( ssp( uint16_t(), uint16_t() ), nsRef ) << endl;
    cout << "16v, 32op: " << results( ssp( uint32_t(), uint16_t() ), nsRef ) << endl;
    cout << "16v, 64op: " << results( ssp( uint64_t(), uint16_t() ), nsRef ) << endl;
    cout << "32v, 32op: " << results( ssp( uint32_t(), uint32_t() ), nsRef ) << endl;
    cout << "32v, 64op: " << results( ssp( uint64_t(), uint32_t() ), nsRef ) << endl;
    cout << "64v, 64op: " << results( ssp( uint64_t(), uint64_t()  ), nsRef ) << endl;
}

MASM:

; uint64_t divLoop( uint64_t *dividends, uint64_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YA_KPEA_K0_K1@Z
; uint32_t divLoop( uint32_t *dividends, uint32_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YAIPEAI0_K1@Z
; uint16_t divLoop( uint16_t *dividends, uint16_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YAGPEAG0_K1@Z
; uint8_t divLoop( uint8_t *dividends, uint8_t *divisors, size_t n, size_t rounds );
PUBLIC ?divLoop@@YAEPEAE0_K1@Z

_TEXT SEGMENT

; rcx = dividends
; rdx = divisors
; r8 = n
; r9 = rounds

; 64-bit variant: uint64_t divLoop( uint64_t *dividends, uint64_t *divisors, size_t n, size_t rounds )
; In:  rcx = dividends, rdx = divisors, r8 = n, r9 = rounds
; Performs `rounds` unsigned divisions dividends[i] / divisors[i], walking the
; index from min( n, rounds left ) - 1 down to 0 and repeating that pass until
; `rounds` is used up. rax is not written after the last div, so it still holds
; the last quotient at ret (left unset when no division is executed).
?divLoop@@YA_KPEA_K0_K1@Z PROC
    ; r12 and r13 are used as scratch below; save them and restore before ret
    push    r12
    push    r13
    ; div uses rdx as the high half of the dividend, so keep the divisors pointer in r10
    mov     r10, rdx
    ; r13 = 0; it is copied into rdx before every div
    sub     r13, r13
outerLoop:
    ; r11 = min( n, rounds left ) - 1 = first index of this pass; leave if that count was 0
    mov     r11, r8
    cmp     r8, r9
    cmova   r11, r9
    sub     r11, 1
    jc      byebye
innerLoop:
    ; rdx:rax = 0:dividends[r11], divided by divisors[r11]; quotient -> rax
    mov     rax, [rcx + r11 * 8]
    mov     r12, [r10 + r11 * 8]
    mov     rdx, r13
    div     r12
    ; r13 becomes 0 again but is computed from the quotient, so the rdx input of
    ; the next div is data-dependent on this div's result
    ; NOTE(review): presumably this is the intended serialization of the divisions
    mov     r13, rax
    and     r13, 0
    ; count the index down; the borrow when it passes 0 ends the pass
    sub     r11, 1
    jnc     innerLoop
    ; rounds -= n; start another pass only if the result is above 0 without borrow
    sub     r9, r8
    ja      outerLoop
byebye:
    pop     r13
    pop     r12
    ret
?divLoop@@YA_KPEA_K0_K1@Z ENDP

; 32-bit variant: uint32_t divLoop( uint32_t *dividends, uint32_t *divisors, size_t n, size_t rounds )
; In:  rcx = dividends, rdx = divisors, r8 = n, r9 = rounds
; Performs `rounds` unsigned divisions dividends[i] / divisors[i], walking the
; index from min( n, rounds left ) - 1 down to 0 and repeating that pass until
; `rounds` is used up. eax is not written after the last div, so it still holds
; the last quotient at ret (left unset when no division is executed).
?divLoop@@YAIPEAI0_K1@Z PROC
    ; r12 and r13 are used as scratch below; save them and restore before ret
    push    r12
    push    r13
    ; div uses edx as the high half of the dividend, so keep the divisors pointer in r10
    mov     r10, rdx
    ; r13 = 0; its low 32 bits are copied into edx before every div
    sub     r13, r13
outerLoop:
    ; r11 = min( n, rounds left ) - 1 = first index of this pass; leave if that count was 0
    mov     r11, r8
    cmp     r8, r9
    cmova   r11, r9
    sub     r11, 1
    jc      byebye
innerLoop:
    ; edx:eax = 0:dividends[r11], divided by divisors[r11]; quotient -> eax
    mov     eax, [rcx + r11 * 4]
    mov     r12d, [r10 + r11 * 4]
    mov     edx, r13d
    div     r12d
    ; r13d becomes 0 again but is computed from the quotient, so the edx input of
    ; the next div is data-dependent on this div's result
    ; NOTE(review): presumably this is the intended serialization of the divisions
    mov     r13d, eax
    and     r13d, 0
    ; count the index down; the borrow when it passes 0 ends the pass
    sub     r11, 1
    jnc     innerLoop
    ; rounds -= n; start another pass only if the result is above 0 without borrow
    sub     r9, r8
    ja      outerLoop
byebye:
    pop     r13
    pop     r12
    ret
?divLoop@@YAIPEAI0_K1@Z ENDP

; 16-bit variant: uint16_t divLoop( uint16_t *dividends, uint16_t *divisors, size_t n, size_t rounds )
; In:  rcx = dividends, rdx = divisors, r8 = n, r9 = rounds
; Performs `rounds` unsigned divisions dividends[i] / divisors[i], walking the
; index from min( n, rounds left ) - 1 down to 0 and repeating that pass until
; `rounds` is used up. ax is not written after the last div, so it still holds
; the last quotient at ret (left unset when no division is executed).
?divLoop@@YAGPEAG0_K1@Z PROC
    ; r12 and r13 are used as scratch below; save them and restore before ret
    push    r12
    push    r13
    ; div uses dx as the high half of the dividend, so keep the divisors pointer in r10
    mov     r10, rdx
    ; r13 = 0; its low 16 bits are copied into dx before every div
    sub     r13, r13
outerLoop:
    ; r11 = min( n, rounds left ) - 1 = first index of this pass; leave if that count was 0
    mov     r11, r8
    cmp     r8, r9
    cmova   r11, r9
    sub     r11, 1
    jc      byebye
innerLoop:
    ; dx:ax = 0:dividends[r11], divided by divisors[r11]; quotient -> ax
    mov     ax, [rcx + r11 * 2]
    mov     r12w, [r10 + r11 * 2]
    mov     dx, r13w
    div     r12w
    ; r13w becomes 0 again but is computed from the quotient, so the dx input of
    ; the next div is data-dependent on this div's result
    ; NOTE(review): presumably this is the intended serialization of the divisions
    mov     r13w, ax
    and     r13w, 0
    ; count the index down; the borrow when it passes 0 ends the pass
    sub     r11, 1
    jnc     innerLoop
    ; rounds -= n; start another pass only if the result is above 0 without borrow
    sub     r9, r8
    ja      outerLoop
byebye:
    pop     r13
    pop     r12
    ret
?divLoop@@YAGPEAG0_K1@Z ENDP

; 8-bit variant: uint8_t divLoop( uint8_t *dividends, uint8_t *divisors, size_t n, size_t rounds )
; In:  rcx = dividends, rdx = divisors, r8 = n, r9 = rounds
; Performs `rounds` unsigned divisions dividends[i] / divisors[i], walking the
; index from min( n, rounds left ) - 1 down to 0 and repeating that pass until
; `rounds` is used up. al is not written after the last div, so it still holds
; the last quotient at ret (left unset when no division is executed).
?divLoop@@YAEPEAE0_K1@Z PROC
    ; r12 and r13 are used as scratch below; save them and restore before ret
    push    r12
    push    r13
    ; dl is overwritten inside the loop, so keep the divisors pointer in r10
    mov     r10, rdx
    ; r13 = 0; its low byte ends up in ah before every div
    sub     r13, r13
outerLoop:
    ; r11 = min( n, rounds left ) - 1 = first index of this pass; leave if that count was 0
    mov     r11, r8
    cmp     r8, r9
    cmova   r11, r9
    sub     r11, 1
    jc      byebye
innerLoop:
    ; the 8-bit div divides ax: al = dividends[r11], ah = 0 (via dl); quotient -> al
    ; NOTE(review): the detour through dl is presumably needed because ah cannot
    ; be encoded in the same instruction as r13b
    mov     al, [rcx + r11]
    mov     r12b, [r10 + r11]
    mov     dl, r13b
    mov     ah, dl
    div     r12b
    ; r13b becomes 0 again but is computed from the quotient, so the ah input of
    ; the next div is data-dependent on this div's result
    ; NOTE(review): presumably this is the intended serialization of the divisions
    mov     r13b, al
    and     r13b, 0
    ; count the index down; the borrow when it passes 0 ends the pass
    sub     r11, 1
    jnc     innerLoop
    ; rounds -= n; start another pass only if the result is above 0 without borrow
    sub     r9, r8
    ja      outerLoop
byebye:
    pop     r13
    pop     r12
    ret
?divLoop@@YAEPEAE0_K1@Z ENDP

_TEXT ENDS

END

现在这段代码完美地展示了除法耗时不取决于寄存器宽度，而取决于操作数(值)的宽度。但我仍然在问自己，为什么 64 位值除以 64 位值(操作数和寄存器均为 64 位)只比 8 位值除以 8 位值慢 1/3。

最佳答案

我假设您的构建类似于 GCC11.2 -O3 -std=gnu++20 https://godbolt.org/z/13YnGKPq9 ，它生成的循环除了加载和 div 之外几乎没有其他工作。所以结果是真实的，没有更大的差异被隐藏在循环开销背后(div 足够慢，可能不会受内存瓶颈限制)。

您只测量 div 吞吐量，而不是延迟。 延迟比吞吐量的可变性更大(可能取决于被除数？)，即使在最近的 CPU 上也是如此。

现代 CPU 通常具有接近恒定的吞吐量，即使延迟仍然可变也是如此。

https://electronics.stackexchange.com/questions/280673/why-does-hardware-division-take-much-longer-than-multiplication/280709#280709 - 现代快速除法器从初始猜测(从查找表)开始，然后用 Newton-Raphson 对其进行改进(使用硬件乘法器作为除法单元的一部分)使事情变得更快。也许在每个阶段使用乘法器单元对其进行流水线处理以提供固定的吞吐量，并提前输出以根据数据提供更低的延迟。
另请参阅 The integer division algorithm of Intel's x86 processors和 How sqrt() of GCC works after compiled? Which method of root is used? Newton-Raphson? (在 Ice Lake 之前，英特尔在与 FP 相同的 div/sqrt 单元上运行整数除法)。此外，与 AMD 不同，Intel before Ice Lake used many more uops对于 64 位 div r64 比对于 32 位或更小的操作数大小。如果您在 Intel 上测试了您的基准测试，您会发现 uint64_t 即使使用较小的值也会使速度变得非常慢。

另请参阅 What is integer division heavily used for? 评论中的讨论，其中谈到硬件除法器随时间推移的改进(尽管该问题主要讨论除法的用例)，以及现代 CPU 的晶体管预算如此之高，以至于不妨把其中一部分用在除法单元上。

您的结果也符合其他人完成的指令吞吐量测量结果。

https://uops.info/在 Zen+ 上使用 ax=0/bl=1 和 ax=0x3301/bl=123 测试了 8 位 div reg8，发现吞吐量没有差异，总是大约 13 或 14 个周期。 (只有延迟差异，从 8 到 25 个周期。或者对于 div r64，从 8 到 41 个周期的延迟取决于值和哪个输入到哪个输出)。低于吞吐量的延迟只有在 dep 链中有其他指令将输出连接回输入时才能观察到，这对我来说仍然很奇怪。
Agner Fog报告 14-46 的 div r64 延迟和 14-45 的吞吐量。 uops.info confirms that , 但由于某种原因没有将不同的吞吐量放入主表中。 0/1“快速除法”确实每 14 个周期运行一次，但 0x343a9ed744556677:0000000000000085/0x75e6e44fccddeeff 的慢速除法需要 43 个周期。 (输入上 RDX=0 的最坏情况可能没那么糟糕；当前的 C++ 编译器从不使用 div 或 idiv 上半部分不是零扩展的，除了是扩展精度的一部分，例如 unsigned __int128。)
对于 AMD K10，他评论 div/idiv 性能:“取决于被除数绝对值中有效位的数量。参见 AMD 软件优化指南。”我不确定现代 AMD 和 Intel CPU 中除法单元的性能是否仍以这种方式随数值变化。
InstLatx64 还测试了一些不同的数据输入，以及将哪些输入与输出耦合回延迟，例如对于Zen1 Ryzen 7 1800X他们发现 div r64 的吞吐量范围为 14c 到 46c。

关于performance - 为什么在 AMD Zen 上整数除法吞吐量与较大值的差异很小？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/71420116/

文章推荐： r - 在 R 中创建随机树

文章推荐： html - 另一个可点击的 div 内的可点击的 div

performance - "performant"软件究竟是什么意思？
关闭。这个问题是opinion-based .它目前不接受答案。想改善这个问题吗？更新问题，以便可以通过 editing this post 用事实和引文回答问题. 8年前关闭。 Improve t
performance - 灿灿授权: Performance Issue
暂时忘记能力的定义，只关注能力的“检查”(使用“授权!”)，我看到 CanCan 添加了大约 400 毫秒，用于简单地检查用户是否具有特定的能力主题/模型。这是预期的吗(我假设不是)？或者，有没有可
performance - Swift 显式与推断类型 : Performance
我正在阅读有关 Swift 的教程 ( http://www.raywenderlich.com/74438/swift-tutorial-a-quick-start )，它预定义为不显式设置类型，因
performance - 编码优先级 : Performance, 可维护性、可重用性？
这主要是由于对 SQL 问题的回答。由于性能原因，有意省略了 UDF 和子查询。我没有包括可靠性并不是说它应该被视为理所当然，但代码必须工作。性能永远是第一位的吗？提供了许多以性能为主要优先事项的答
performance - Scala递归与循环: performance and runtime considerations
我已经编写了一个简单的测试平台来测量三种阶乘实现的性能:基于循环的，非尾递归的和尾递归的。 Surprisingly to me the worst performant was the loop o
performance - ui-performance 插件无法在开发模式下工作 (Grails)
我已将 ui-performance 插件应用到我的应用程序中。不幸的是，在开发模式下运行应用程序时它似乎不起作用。例如，我的 javascript 导入是用“vnull”版本呈现的。例如不会
performance - 编译 F# 引用 : performance?
我有一个我操作的 F# 引用(我在各处添加对象池以回收经常创建和删除的短期对象)。我想运行结果报价；现在我使用了 F# PowerPack，它提供了将引用转换为表达式树和委托(delegate)的方法
performance - Spark独立: SparklyR : Performance issues
我正在尝试在 Spark 服务器上运行 SparklyR 库中的机器学习算法。 1 个簇 8 核 24G内存 Ubuntu 16.04 星火2.2 独立配置 1名师傅/2名 worker 每个执行器的
performance - 架构和索引以及主键 : Differences in lookup performance?
我有一个数据库(准确地说是在 postgres 上运行)，具有以下结构: user1 (schema) | - cars (table) - airplanes (table, again) .
performance - iOS/核心动画 : Performance tuning
我的应用程序在我的 iPad 上运行。但它的表现非常糟糕——我的速度低于 15fps。谁能帮我优化一下？它基本上是一个轮子(派生自 UIView)，包含 12 个按钮(派生自 UIControl)。
performance - coursera progfun1 : scala union performance
在完成“Scala 中的函数式编程原则”@coursera 类(class)第 3 周的作业时，我发现当我实现视频类(class)中所示的函数联合时: override def union(tha
performance - Symfony2 依赖注入(inject) : performances impact
我正在重构我的一个 Controller 以使其成为一项服务，我想知道不将整个服务容器注入(inject)我的 Controller 是否会对性能产生影响。这样效率更高吗: innova.path.
performance - facelet tag performance
我有一个要显示的内容很大的文件。例如在显示用户配置文件时，中的每个 EL 表达式需要一个 userId 作为 bean 的参数，该参数取自 session 上下文。我在 xhtml 文件中将这个 u
performance - OpenGL/DirectX : How does Mipmapping improve performance?
我非常了解 mipmapping。我不明白(在硬件/驱动程序级别)是 mipmapping 如何提高应用程序的性能(至少这是经常声称的)。在执行片段着色器之前，驱动程序不知道要访问哪个 mipmap
performance - Scala 惰性值 : performance penalty? 线程安全？
这个问题在这里已经有了答案: 10年前关闭。 Possible Duplicate: What's the (hidden) cost of lazy val? (Scala) Scala 允许定义惰
java - build().perform() 和 Perform() 之间有什么区别
一些文章建议现在 build() 包含在 perform() 本身中，而其他人则建议当要链接多个操作时使用 build().perform()一起。最佳答案 build() 包含在 perform(
performance - postgres 函数 : when does IMMUTABLE hurt performance?
Postgres docs说 For best optimization results, you should label your functions with the strictest vol
performance - 零成本抽象 : performance of for-loop vs. 迭代器
阅读Zero-cost abstractions看着 Introduction to rust: a low-level language with high-level abstractions我尝
performance - MQ : CPU Performance 上的 SSL
我想在 MQ 服务器上部署 SSL，但我想知道我当前的 CPU 容量是否支持 SSL。 (我没有预算增加 CPU 内核和 MQ PVU 的数量) 我的规范: Windows 2003 服务器 SP2，
performance - Chrome Performance Profiler 中的“Timings”选项卡丢失
因此，我在 Chrome 开发者工具的性能选项卡内的时间部分成功地监控了我的 React Native 应用程序的性能。突然在应用程序的特定重新加载时，Timings 标签丢失。我已尝试重置

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

performance - 为什么在 AMD Zen 上整数除法吞吐量与较大值的差异很小？