gpt4 book ai didi

x86 - 使用 AVX 最快实现指数函数

转载 作者:行者123 更新时间:2023-12-01 17:24:19 29 4
gpt4 key购买 nike

我正在寻找在 AVX 元素(单精度浮点)上运行的指数函数的高效(快速)近似。即 - __m256 _mm256_exp_ps( __m256 x ) 没有 SVML。

相对精度应该类似于 ~1e-6,或 ~20 尾数位(2^20 中的 1 部分)。

如果它是使用 Intel 内在函数以 C 风格编写的,我会很高兴。
代码应该是可移植的(Windows、macOS、Linux、MSVC、ICC、GCC 等)。

<小时/>

这类似于 Fastest Implementation of Exponential Function Using SSE ,但是这个问题正在寻找非常快且低精度的问题(当前的答案给出了大约 1e-3 的精度)。

此外,此问题正在寻找 AVX/AVX2(和 FMA)。但请注意,这两个问题的答案很容易在 SSE4 __m128 或 AVX2 __m256 之间移植,因此 future 的读者应该根据所需的精度/性能权衡进行选择。

最佳答案

avx_mathfun 中的 exp 函数使用范围缩减与切比雪夫近似多项式相结合来与 AVX 指令并行计算 8 exp-s。使用正确的编译器设置,确保在可能的情况下将 addpsmulps 融合到 FMA 指令。

改编 avx_mathfun 中的原始 exp 代码非常简单。到可移植(跨不同编译器)C/AVX2 内在函数代码。原始代码使用 gcc 风格的对齐属性和巧妙的宏。修改后的代码使用标准 _mm256_set1_ps() 代替,位于小测试代码和表格下方。修改后的代码需要AVX2。

以下代码用于简单测试:

int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f ,4.0f ,5.0f, 6.0f, 7.0f, 8.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);

for (i=0;i<8;i++){
printf("i = %i, x = %e, y = %e \n",i,xv[i],yv[i]);
}
return 0;
}

输出似乎没问题:

i = 0, x = 1.000000e+00, y = 2.718282e+00 
i = 1, x = 2.000000e+00, y = 7.389056e+00
i = 2, x = 3.000000e+00, y = 2.008554e+01
i = 3, x = 4.000000e+00, y = 5.459815e+01
i = 4, x = 5.000000e+00, y = 1.484132e+02
i = 5, x = 6.000000e+00, y = 4.034288e+02
i = 6, x = 7.000000e+00, y = 1.096633e+03
i = 7, x = 8.000000e+00, y = 2.980958e+03

修改后的代码(AVX2)是:

#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell expc.c */

__m256 exp256_ps(__m256 x) {
/* Modified code. The original code is here: https://github.com/reyoung/avx_mathfun

AVX implementation of exp
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
/*
To increase the compatibility across different compilers the original code is
converted to plain AVX2 intrinsics code without ingenious macro's,
gcc style alignment attributes etc. The modified code requires AVX2
*/
__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);

__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341);
__m256 cephes_exp_C1 = _mm256_set1_ps(0.693359375);
__m256 cephes_exp_C2 = _mm256_set1_ps(-2.12194440e-4);

__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256 tmp = _mm256_setzero_ps(), fx;
__m256i imm0;
__m256 one = _mm256_set1_ps(1.0f);

x = _mm256_min_ps(x, exp_hi);
x = _mm256_max_ps(x, exp_lo);

/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, cephes_LOG2EF);
fx = _mm256_add_ps(fx, _mm256_set1_ps(0.5f));
tmp = _mm256_floor_ps(fx);
__m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
mask = _mm256_and_ps(mask, one);
fx = _mm256_sub_ps(tmp, mask);
tmp = _mm256_mul_ps(fx, cephes_exp_C1);
__m256 z = _mm256_mul_ps(fx, cephes_exp_C2);
x = _mm256_sub_ps(x, tmp);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x,x);

__m256 y = cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);

/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}

int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(1.0f, 2.0f, 3.0f ,4.0f ,5.0f, 6.0f, 7.0f, 8.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);

for (i=0;i<8;i++){
printf("i = %i, x = %e, y = %e \n",i,xv[i],yv[i]);
}
return 0;
}

<小时/>如 @Peter Cordes points out ,应该可以将 _mm256_floor_ps(fx + 0.5f) 替换为 _mm256_round_ps(fx)。此外, mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); 和接下来的两行似乎是多余的。通过将 cephes_exp_C1cephes_exp_C2 组合到 inv_LOG2EF 中,可以进一步优化。这导致以下代码尚未经过彻底测试!

#include <stdio.h>
#include <immintrin.h>
#include <math.h>
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell expc.c -lm */

__m256 exp256_ps(__m256 x) {
/* Modified code from this source: https://github.com/reyoung/avx_mathfun

AVX implementation of exp
Based on "sse_mathfun.h", by Julien Pommier
http://gruntthepeon.free.fr/ssemath/
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)

*/
/*
To increase the compatibility across different compilers the original code is
converted to plain AVX2 intrinsics code without ingenious macro's,
gcc style alignment attributes etc.
Moreover, the part "express exp(x) as exp(g + n*log(2))" has been significantly simplified.
This modified code is not thoroughly tested!
*/


__m256 exp_hi = _mm256_set1_ps(88.3762626647949f);
__m256 exp_lo = _mm256_set1_ps(-88.3762626647949f);

__m256 cephes_LOG2EF = _mm256_set1_ps(1.44269504088896341f);
__m256 inv_LOG2EF = _mm256_set1_ps(0.693147180559945f);

__m256 cephes_exp_p0 = _mm256_set1_ps(1.9875691500E-4);
__m256 cephes_exp_p1 = _mm256_set1_ps(1.3981999507E-3);
__m256 cephes_exp_p2 = _mm256_set1_ps(8.3334519073E-3);
__m256 cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2);
__m256 cephes_exp_p4 = _mm256_set1_ps(1.6666665459E-1);
__m256 cephes_exp_p5 = _mm256_set1_ps(5.0000001201E-1);
__m256 fx;
__m256i imm0;
__m256 one = _mm256_set1_ps(1.0f);

x = _mm256_min_ps(x, exp_hi);
x = _mm256_max_ps(x, exp_lo);

/* express exp(x) as exp(g + n*log(2)) */
fx = _mm256_mul_ps(x, cephes_LOG2EF);
fx = _mm256_round_ps(fx, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
__m256 z = _mm256_mul_ps(fx, inv_LOG2EF);
x = _mm256_sub_ps(x, z);
z = _mm256_mul_ps(x,x);

__m256 y = cephes_exp_p0;
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p1);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p2);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p3);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p4);
y = _mm256_mul_ps(y, x);
y = _mm256_add_ps(y, cephes_exp_p5);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, x);
y = _mm256_add_ps(y, one);

/* build 2^n */
imm0 = _mm256_cvttps_epi32(fx);
imm0 = _mm256_add_epi32(imm0, _mm256_set1_epi32(0x7f));
imm0 = _mm256_slli_epi32(imm0, 23);
__m256 pow2n = _mm256_castsi256_ps(imm0);
y = _mm256_mul_ps(y, pow2n);
return y;
}

int main(){
int i;
float xv[8];
float yv[8];
__m256 x = _mm256_setr_ps(11.0f, -12.0f, 13.0f ,-14.0f ,15.0f, -16.0f, 17.0f, -18.0f);
__m256 y = exp256_ps(x);
_mm256_store_ps(xv,x);
_mm256_store_ps(yv,y);

/* compare exp256_ps with the double precision exp from math.h,
print the relative error */
printf("i x y = exp256_ps(x) double precision exp relative error\n\n");
for (i=0;i<8;i++){
printf("i = %i x =%16.9e y =%16.9e exp_dbl =%16.9e rel_err =%16.9e\n",
i,xv[i],yv[i],exp((double)(xv[i])),
((double)(yv[i])-exp((double)(xv[i])))/exp((double)(xv[i])) );
}
return 0;
}

下一个表通过将 exp256_ps 与 math.h 中的 double exp 进行比较,给出了某些点的准确性的印象。相对误差在最后一列。

i      x                     y = exp256_ps(x)      double precision exp        relative error

i = 0 x = 1.000000000e+00 y = 2.718281746e+00 exp_dbl = 2.718281828e+00 rel_err =-3.036785947e-08
i = 1 x =-2.000000000e+00 y = 1.353352815e-01 exp_dbl = 1.353352832e-01 rel_err =-1.289636419e-08
i = 2 x = 3.000000000e+00 y = 2.008553696e+01 exp_dbl = 2.008553692e+01 rel_err = 1.672817689e-09
i = 3 x =-4.000000000e+00 y = 1.831563935e-02 exp_dbl = 1.831563889e-02 rel_err = 2.501162103e-08
i = 4 x = 5.000000000e+00 y = 1.484131622e+02 exp_dbl = 1.484131591e+02 rel_err = 2.108215155e-08
i = 5 x =-6.000000000e+00 y = 2.478752285e-03 exp_dbl = 2.478752177e-03 rel_err = 4.380257261e-08
i = 6 x = 7.000000000e+00 y = 1.096633179e+03 exp_dbl = 1.096633158e+03 rel_err = 1.849522682e-08
i = 7 x =-8.000000000e+00 y = 3.354626242e-04 exp_dbl = 3.354626279e-04 rel_err =-1.101575118e-08

关于x86 - 使用 AVX 最快实现指数函数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/48863719/

29 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com