gcc - 为什么 GCC std::atomic 增量会产生低效的非原子汇编？-6ren

gcc - 为什么 GCC std::atomic 增量会产生低效的非原子汇编？

转载作者：行者123 更新时间：2023-12-04 01:52:31

我使用 gcc 的 Intel 兼容内置函数(如 __sync_fetch_and_add)已经有一段时间了，并且是通过我自己的 atomic 模板来使用的。“__sync”函数现在被正式视为“遗留”函数。

C++11 支持 std::atomic<>及其后代，因此使用它似乎是合理的，因为它使我的代码符合标准，并且编译器将以独立于平台的方式生成最佳代码，这几乎好得令人难以置信。
顺便说一句，我也只需要把 atomic 文本替换为 std::atomic 即可。std::atomic 里面有很多东西(例如内存模型)我并不真正需要，但默认参数会处理好这一点。

现在是坏消息。事实证明，据我所知，生成的代码……完全是一团糟，甚至根本不是原子的。即使是递增单个原子变量并输出它的最小示例，也有不少于 5 个非内联函数调用，分别调用 ___atomic_flag_for_address、___atomic_flag_wait_explicit 和 __atomic_flag_clear_explicit (已完全优化)；另一方面，生成的可执行文件中没有任何一条原子指令。

这是怎么回事？当然，编译器错误的可能性始终存在，但由于审阅者和用户数量庞大，如此严重的问题通常不太可能不被注意到。这意味着，这可能不是错误，而是预期的行为。

这么多函数调用背后的“原理”是什么，没有原子性如何实现原子性？

尽可能简单的例子:

#include <atomic>

// Minimal reproduction: create one atomic int, increment it once, print it.
int main()
{
    std::atomic_int a(5);   // atomic integer initialised to 5
    ++a;                    // the atomic increment whose generated code is under discussion
    // __builtin_printf is the compiler builtin, so no <cstdio> include is needed;
    // the (int) conversion reads the atomic's current value.
    __builtin_printf("%d", (int)a);
    return 0;
}

产生以下 .s :

movl    $5, 28(%esp)     #, a._M_i
movl    %eax, (%esp)     # tmp64,
call    ___atomic_flag_for_address   #
movl    $5, 4(%esp)  #,
movl    %eax, %ebx   #, __g
movl    %eax, (%esp)     # __g,
call    ___atomic_flag_wait_explicit     #
movl    %ebx, (%esp)     # __g,
addl    $1, 28(%esp)     #, MEM[(__i_type *)&a]
movl    $5, 4(%esp)  #,
call    _atomic_flag_clear_explicit  #
movl    %ebx, (%esp)     # __g,
movl    $5, 4(%esp)  #,
call    ___atomic_flag_wait_explicit     #
movl    28(%esp), %esi   # MEM[(const __i_type *)&a], __r
movl    %ebx, (%esp)     # __g,
movl    $5, 4(%esp)  #,
call    _atomic_flag_clear_explicit  #
movl    $LC0, (%esp)     #,
movl    %esi, 4(%esp)    # __r,
call    _printf  #
(...)
.def    ___atomic_flag_for_address; .scl    2;  .type   32; .endef
.def    ___atomic_flag_wait_explicit;   .scl    2;  .type   32; .endef
.def    _atomic_flag_clear_explicit;    .scl    2;  .type   32; .endef

...而上面提到的函数在 objdump 中看起来例如像这样:

004013c4 <__atomic_flag_for_address>:
mov    0x4(%esp),%edx
mov    %edx,%ecx
shr    $0x2,%ecx
mov    %edx,%eax
shl    $0x4,%eax
add    %ecx,%eax
add    %edx,%eax
mov    %eax,%ecx
shr    $0x7,%ecx
mov    %eax,%edx
shl    $0x5,%edx
add    %ecx,%edx
add    %edx,%eax
mov    %eax,%edx
shr    $0x11,%edx
add    %edx,%eax
and    $0xf,%eax
add    $0x405020,%eax
ret

其他的稍微简单一些，但我没有找到任何一条真正原子的指令(除了一些零散的 xchg，它们在 x86 上是原子的，但这些似乎只是 NOP/填充，因为它们是紧跟在 ret 之后的 xchg %ax,%ax)。

我完全不明白为什么需要这样一个相当复杂的函数，以及它是如何使任何东西成为原子的。

最佳答案

这是编译器构建不当造成的。

检查您的 c++config.h，它应该看起来像下面这样，但在您的安装中并非如此:

/* Define if builtin atomic operations for bool are supported on this host. */
#define _GLIBCXX_ATOMIC_BUILTINS_1 1

/* Define if builtin atomic operations for short are supported on this host.
   */
#define _GLIBCXX_ATOMIC_BUILTINS_2 1

/* Define if builtin atomic operations for int are supported on this host. */
#define _GLIBCXX_ATOMIC_BUILTINS_4 1

/* Define if builtin atomic operations for long long are supported on this
   host. */
#define _GLIBCXX_ATOMIC_BUILTINS_8 1

这些宏是否被定义取决于 configure 测试，这些测试会检查主机是否支持 __sync_XXX 函数。这些测试位于 libstdc++-v3/acinclude.m4 中的 AC_DEFUN([GLIBCXX_ENABLE_ATOMIC_BUILTINS] ...。

在您的安装中，从 -fverbose-asm 写入汇编文件的 MEM[(__i_type *)&a] 可以明显看出，编译器使用了来自 atomic_0.h 的宏，例如:

/* Load through a per-address flag instead of a hardware atomic instruction.
   Steps, as written below: take the address of _ATOMIC_MEMBER_, look up the
   flag associated with that address, wait on the flag with ordering __x,
   read the value, clear the flag with ordering __x, and yield the value read
   as the result of the GNU statement expression.
   NOTE(review): __a is not referenced directly in this excerpt; presumably
   _ATOMIC_MEMBER_ (defined elsewhere) expands in terms of __a -- confirm
   against the full header.  */
#define _ATOMIC_LOAD_(__a, __x)                        \
  ({typedef __typeof__(_ATOMIC_MEMBER_) __i_type;                          \
    __i_type* __p = &_ATOMIC_MEMBER_;                      \
    __atomic_flag_base* __g = __atomic_flag_for_address(__p);          \
    __atomic_flag_wait_explicit(__g, __x);                 \
    __i_type __r = *__p;                           \
    atomic_flag_clear_explicit(__g, __x);                      \
    __r; })

使用正确构建的编译器编译您的示例程序时，c++ -m32 -std=c++0x -S -O2 -march=core2 -fverbose-asm 应该产生类似下面这样的输出:

movl    $5, 28(%esp)    #, a.D.5442._M_i
lock addl   $1, 28(%esp)    #,
mfence
movl    28(%esp), %eax  # MEM[(const struct __atomic_base *)&a].D.5442._M_i, __ret
mfence
movl    $.LC0, (%esp)   #,
movl    %eax, 4(%esp)   # __ret,
call    printf  #

关于 gcc - 为什么 GCC std::atomic 增量会产生低效的非原子汇编？，我们在 Stack Overflow 上找到一个类似的问题： https://stackoverflow.com/questions/8121131/

文章推荐： r - 在 R 中编码查询字符串

文章推荐： NHibernate QueryOver 限制字符串长度

文章推荐： emacs:如何禁用 .# 文件创建

javascript - 低效/丑陋的替换功能
我写了一个函数，应该用值替换两个定界符之间的代码，它返回(我将其应用到的字符串是 HTML 对象的 .outerHTML)。这将类似于它在例如中的使用方式。 Vue.js 或 Angular。看起
python - Django - 查询重复/低效
好的，我有一个 Django View ，如下所示: @render_to('home/main.html') def login(request): # also tried Client.
c# - 在类之间传递 WinForm 控件是否危险/低效？
由于我创建的几乎所有项目都包含 ListView，因此我想到创建一个类，其中包含修改 ListView 的所有重要功能。它看起来像这样: 主窗体: ListViewFunctions LVF = ne
.net - FileStream.ReadByte - 低效 - 这是什么意思？
The default implementation on Stream creates a new single-byte array and then calls Read. While this
mysql - Drupal 数据库结构 - 高效/低效？
我当然不是 Drupal 专家，但我之前设计并构建了一些数据库，所以我对第 3 方团队正在处理的数据库结构感到困惑，我已经将 Sequel Pro 添加到其中虚拟内容。我认为如果使用 Drupal 的
Python 生成随机 : uuid v. md5 v. random 哪个最有效或*低效*
我想生成一个随机的短十六进制字符串(比如 8 位或 16 位)。有很多选择可以做到这一点，例如，从我的头顶开始: uuid.uuid4().hex[:8] md5().hexdigest()[:8]

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

gcc - 为什么 GCC std::atomic 增量会产生低效的非原子汇编？