gpt4 book ai didi

c++ - 为什么来自对齐的 std::array 的初始自动矢量化加载是标量? (g++/clang++)

转载 作者:行者123 更新时间:2023-11-30 05:19:05 25 4
gpt4 key购买 nike

我无法理解是什么阻止编译器在从 std::array 读取数据时使用初始 vector 加载。

我知道 gcc 可以使用 -fopt-info-vec-* 生成调试信息。我无法从详细的日志中找到任何内容来说明为什么两个编译器都做出相同的次优决定来使用初始标量加载。

另一方面,我不知道如何让 clang 提供有关矢量化问题的详细信息。 -Rpass-analysis=loop-vectorize 仅报告 init 中的循环不值得交错。当然,我的内部函数(intrinsics)版本证明循环可以矢量化,但所需的转换对编译器来说可能太复杂了。

我当然可以使用内部函数实现热路径,但这需要为每个 cpu 架构复制相同的逻辑。我更愿意编写编译器可以近乎完美地矢量化的标准 C++ 代码。使用 target_clones 属性或宏和 target 属性,使用不同的标志多次编译相同的代码变得很简单。

如何让编译器告诉我加载矢量化失败的原因?

我怀疑 gcc 可能已经打印了我只是不知道我在找什么的信息。

为什么自动矢量化在初始加载时失败?

    /**
* This is a test case that removes the abstraction layers from my actual code.
* In my real code, access to pack additionally loses its alignment
* information, but that wasn't the only issue: compilers still generate
* suboptimal machine code with alignment information present. I fail to
* understand why loads are treated differently from stores to the
* same address when auto-vectorization is used.
*
* I tested gcc 6.2 and clang 3.9
* g++ -O3 -g -march=native vectest.cc -o vectest -fvect-cost-model=unlimited
* clang++ -O3 -g -march=native vectest.cc -o vectest
*/


#include <array>
#include <cstdint>

/* Source data for the reductions below: 52 64-bit words, 32-byte aligned so
 * that the aligned 256-bit loads/stores used in this file are valid on it. */
alignas(32) std::array<uint64_t, 52> pack;
/* Destination of the reductions: four 64-bit words (exactly one 256-bit
 * vector), also 32-byte aligned. */
alignas(32) uint64_t board[4];

/**
 * Fill pack so that pack[i] has only bit (i + initial) set.
 *
 * The shift amount must stay below 64 for every element, i.e.
 * initial + pack.size() <= 64; larger values are undefined behaviour.
 */
__attribute__((noinline))
static void init(uint64_t initial)
{
    /* Clang seems to prefer a large constant table and an unrolled copy,
     * which should perform worse outside a micro benchmark. L1 misses
     * and memory bandwidth are bigger bottlenecks than ALU instruction
     * execution. But of course this code won't be compiled into a hot path,
     * so I don't care how it is compiled as long as it works correctly.
     *
     * But the most interesting detail from clang is that vectorized stores
     * are generated correctly, like:
     4005db: vpsllvq %ymm2,%ymm1,%ymm2
     4005e0: vmovdqa %ymm2,0x200a78(%rip) # 601060 <pack>
     4005e8: vpaddq 0x390(%rip),%ymm0,%ymm2 # 400980 <_IO_stdin_used+0x60>
     4005f0: vpsllvq %ymm2,%ymm1,%ymm2
     4005f5: vmovdqa %ymm2,0x200a83(%rip) # 601080 <pack+0x20>
     4005fd: vpaddq 0x39b(%rip),%ymm0,%ymm2 # 4009a0 <_IO_stdin_used+0x80>
     *
     * gcc prefers a scalar loop.
     */

    for (unsigned i = 0; i < pack.size(); i++) {
        /* uint64_t{1} rather than 1UL: unsigned long is only 32 bits wide on
         * some ABIs, where shifting it by up to 51 bits would be undefined. */
        pack[i] = uint64_t{1} << (i + initial);
    }
}

#include "immintrin.h"
/**
 * Hand-written AVX2 reference for init(): the code the author would consider
 * the ideal auto-vectorization result.
 *
 * Stores pack.size()/4 vectors of four 64-bit lanes; lane j of vector k ends
 * up with only bit (4*k + j + initial) set. Compiles to an empty function
 * when __AVX2__ is not defined.
 */
__attribute__((noinline))
static void expected_init(uint64_t initial)
{
#if __AVX2__
    /* View the 32-byte aligned array as 256-bit vectors. A pointer cast is
     * the conventional way to do this with the intrinsic vector types; the
     * previous union-based pointer pun read an inactive union member, which
     * ISO C++ does not define. */
    __m256i *const dst = reinterpret_cast<__m256i *>(pack.data());

    /* _mm256_set_epi64x takes its arguments from the highest lane down to the
     * lowest, so lane j starts out as 1 << j. */
    __m256i t = _mm256_set_epi64x(
        1LL << 3,
        1LL << 2,
        1LL << 1,
        1LL << 0
    );
    /* initial is just an extra runtime number to prevent constant array
     * initialization. The intrinsic takes an int shift count.
     */
    t = _mm256_slli_epi64(t, static_cast<int>(initial));
    for (unsigned i = 0; i < pack.size() / 4; i++) {
        _mm256_store_si256(&dst[i], t);
        /* The next vector covers the next four bit positions. */
        t = _mm256_slli_epi64(t, 4);
    }
#else
    (void)initial; /* nothing to do without AVX2; avoid an unused warning */
#endif
}

/**
 * OR-reduce pack into board using iterators.
 *
 * Elements are consumed four at a time; the first of each group is folded
 * into n, the second into e, the third into s and the fourth into w, and the
 * four accumulators are then written to board[0..3]. The loop advances the
 * iterator by four per pass and only compares against end() once per pass, so
 * it relies on pack.size() being a multiple of 4 (52 here).
 */
__attribute__((noinline))
static void iter_or()
{
/** initial load (clang):
4006f0: vmovaps 0x200988(%rip),%xmm0 # 601080 <pack+0x20>
4006f8: vorps 0x200960(%rip),%xmm0,%xmm0 # 601060 <pack>
400700: vmovaps 0x200988(%rip),%xmm1 # 601090 <pack+0x30>
400708: vorps 0x200960(%rip),%xmm1,%xmm1 # 601070 <pack+0x10>
400710: vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
* expected:
400810: vmovaps 0x200868(%rip),%ymm0 # 601080 <pack+0x20>
400818: vorps 0x200840(%rip),%ymm0,%ymm0 # 601060 <pack>
400820: vorps 0x200878(%rip),%ymm0,%ymm0 # 6010a0 <pack+0x40>
*/

auto iter = pack.begin();
/* Seed the four accumulators from the first four elements; this is the
 * "initial load" the question is about. */
uint64_t n(*iter++),
e(*iter++),
s(*iter++),
w(*iter++);
/* Fold the remaining elements, four per pass. */
for (;iter != pack.end();) {
n |= *iter++;
e |= *iter++;
s |= *iter++;
w |= *iter++;
}
/** The store is correctly vectorized to a single instruction */
board[0] = n;
board[1] = e;
board[2] = s;
board[3] = w;
}

/**
 * OR-reduce pack into board using array indexing.
 *
 * Same reduction as the iterator variant: pack[i] is folded into accumulator
 * i % 4 (n, e, s, w), and the accumulators are written to board[0..3]. The
 * loop steps by four and reads pack[i+0..i+3], so it relies on pack.size()
 * being a multiple of 4 (52 here).
 */
__attribute__((noinline))
static void index_or()
{
/** Clang compiles this to the same code as the iterator variant. gcc goes
* completely insane. I don't even want to try to guess what all the
* permutation stuff is trying to achieve.
*/
unsigned i;
/* Seed the four accumulators from the first four elements. */
uint64_t n(pack[0]),
e(pack[1]),
s(pack[2]),
w(pack[3]);
/* Fold the remaining elements, four per pass. */
for (i = 4 ; i < pack.size(); i+=4) {
n |= pack[i+0];
e |= pack[i+1];
s |= pack[i+2];
w |= pack[i+3];
}
board[0] = n;
board[1] = e;
board[2] = s;
board[3] = w;
}

#include "immintrin.h"

/**
 * Hand-written AVX2 reference for the OR reduction: the code the author
 * expects auto-vectorization to produce from the scalar C++ variants.
 *
 * Loads pack as pack.size()/4 aligned 256-bit vectors, ORs them together and
 * stores the result to board with one aligned 256-bit store. Compiles to an
 * empty function when __AVX2__ is not defined.
 */
__attribute__((noinline))
static void expected_result()
{
#if __AVX2__
    /* View the 32-byte aligned array as 256-bit vectors. A pointer cast is
     * the conventional way to do this with the intrinsic vector types; the
     * previous union-based pointer pun read an inactive union member, which
     * ISO C++ does not define. */
    const __m256i *const src = reinterpret_cast<const __m256i *>(pack.data());

    /* Seed the accumulator with the first vector, then fold in the rest. */
    __m256i res = _mm256_load_si256(&src[0]);
    for (unsigned i = 1; i < pack.size() / 4; i++) {
        res = _mm256_or_si256(res, _mm256_load_si256(&src[i]));
    }

    /* board is four 64-bit words, i.e. exactly one 256-bit vector. */
    _mm256_store_si256(reinterpret_cast<__m256i *>(board), res);
#endif
}

/* Runs every variant once so that none of them is discarded as unused. */
int main(int argc, char **argv)
{
    (void)argv;

    /* A value derived from the argument count (0 for a plain invocation) is
     * only known at run time, which prevents constant array initialization. */
    const uint64_t initial = argc - 1;

    expected_init(initial);
    init(initial);

    iter_or();
    index_or();
    expected_result();
}

最佳答案

gcc 和 clang 似乎都无法将循环之外的初始加载矢量化。如果把代码改为先将临时变量清零,然后从第一个元素开始执行按位或,两个编译器的表现都会更好。Clang 会生成良好的展开 vector 代码(唯一的瓶颈是只使用了单个 ymm 寄存器,所有指令都依赖于前一条指令)。GCC 生成的代码更糟:多了一条初始的 vpxor,而且循环很差,每次迭代只执行一条 vpor。

我还测试了一些替代实现,其中最好的微基准测试是使用交替寄存器改进的 clangs 展开代码。

/* Fully unrolled OR reduction over thirteen 32-byte vectors (offsets 0x00 to
 * 0x180 from operand %0), alternating between ymm0 and ymm1 so that
 * consecutive vpor instructions do not depend on each other. The result is
 * stored to the address in operand %1.
 * NOTE(review): excerpt only; the enclosing asm statement and its operand
 * list are not shown. Presumably %0 is pack and %1 the output buffer.
 *
 * only reduce (calling this function from a for loop):
* ST 7.3 cycles (ST=single thread)
* SMT 15.3 cycles (SMT=simultaneous multi threading aka hyper threading)
* shuffle+reduce (calling Fisher-Yates shuffle and then this function):
* ST 222 cycles
* SMT 383 cycles
*/
"vmovaps 0x00(%0), %%ymm0\n"
"vmovaps 0x20(%0), %%ymm1\n"
"vpor 0x40(%0), %%ymm0, %%ymm0\n"
"vpor 0x60(%0), %%ymm1, %%ymm1\n"
"vpor 0x80(%0), %%ymm0, %%ymm0\n"
"vpor 0xA0(%0), %%ymm1, %%ymm1\n"
"vpor 0xC0(%0), %%ymm0, %%ymm0\n"
"vpor 0xE0(%0), %%ymm1, %%ymm1\n"
"vpor 0x100(%0), %%ymm0, %%ymm0\n"
"vpor 0x120(%0), %%ymm1, %%ymm1\n"
"vpor 0x140(%0), %%ymm0, %%ymm0\n"
"vpor 0x160(%0), %%ymm1, %%ymm1\n"
"vpor 0x180(%0), %%ymm0, %%ymm0\n"

/* Combine the two partial results and store the final vector. */
"vpor %%ymm0, %%ymm1, %%ymm0\n"
"vmovaps %%ymm0, 0x00(%1)\n"

Clang 展开循环的时间如下

/* only reduce:
* ST 9.8 cycles
* SMT 21.8 cycles
* shuffle+reduce:
* ST 223 cycles
* SMT 385 cycles
*/

但 SMT 降低展开代码性能的数字看起来很可疑。我决定尝试更好地编写仍然明显比展开慢的 GCC 循环。但后来我决定通过使用两个寄存器和一次展开循环来打破指令依赖性。这导致 shuffle+reduce 代码比完全展开稍微快一些。

/* Element index of the third-to-last vector; the asm counts it down to 0. */
size_t end = pack.size() - 3*4;
/* volatile: the useful effect is the store through [out], not the [cnt]
 * output, so the statement must not be discarded when [cnt] is unused. */
asm volatile (
/* The best SMT option outside micro optimization.
* This allows executing two vpor instructions at the same time and
* halves the loop count with a single unroll.
*
* The last two vectors (offsets 0x180 and 0x160) seed ymm0 and ymm1; the
* remaining vectors are ORed in from the back, two per iteration, while
* [cnt] is decremented by 8 elements until it reaches 0.
*
* only reduce:
* ST 13.0 cycles
* SMT 20.0 cycles
* shuffle+reduce:
* ST 221 cycles
* SMT 380 cycles
*/
"vmovaps 0x180(%[pack]), %%ymm0\n"
"vmovaps 0x160(%[pack]), %%ymm1\n"
"vpor 0x00(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
"1:\n"
"vpor -0x20(%[pack],%[cnt],8), %%ymm1, %%ymm1\n"
"vpor -0x40(%[pack],%[cnt],8), %%ymm0, %%ymm0\n"
"sub $8, %[cnt]\n"
"jne 1b\n"

"vpor %%ymm0, %%ymm1, %%ymm0\n"
"vmovaps %%ymm0, 0x00(%[out])\n"
: [cnt]"+r"(end)
: [pack]"r"(begin), [out]"r"(hands_)
/* ymm0/ymm1 are overwritten, sub changes the flags, and memory is read
 * and written through pointers the compiler cannot see into. */
: "ymm0", "ymm1", "cc", "memory");

但是当代码在 Fisher-Yates 洗牌之后运行时,差异小得惊人。即使 gcc 版本在仅 reduce 的基准测试 (16.4/38.8) 中明显落后,运行 shuffle+reduce 测试时速度也几乎相同 (228/387)。

关于c++ - 为什么来自对齐的 std::array 的初始自动矢量化加载是标量? (g++/clang++),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41310990/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com