gpt4 book ai didi

algorithm - 高效准确地计算 hypot(a,b) 的倒数

转载 作者:行者123 更新时间:2023-12-04 08:59:14 26 4
gpt4 key购买 nike

Givens 旋转提供了一种强大且易于并行化的方式来实现 QR 分解。 Givens 旋转需要计算旋转角度的正弦和余弦分量。在实际计算的情况下,这通常涉及计算 hypot() 的倒数。函数来归一化一个二向量,如 Wikipedia 中的示例所示.
虽然这避免了中间计算中的大多数上溢和下溢情况,但对于非常大的值 a , b , hypot(a,b)可能会溢出到无穷大,而 1/√(a2+b2) 实际上可以表示为次正规浮点数。此外,除法的使用进一步增加了计算成本,这在具有缓慢浮点除法的平台上可能很重要。
一个函数rhypot(a,b)以类似于标准 hypot() 的成本直接计算 1/√(a2+b2)因此,功能将是可取的。准确性应该与计算的天真方法相同或更好1.0/hypot(a,b) .使用正确的圆角 hypot函数,该表达式的最大误差为 1.5 ulps。
如何高效准确地实现这样的功能?可以假设使用 IEEE-754 二进制浮点算法和对融合乘加 (FMA) 运算的 native 硬件支持的可用性。为了便于说明和测试,我们可以限制为单精度计算,即 IEEE-754 binary32格式。

最佳答案

在下面,我将展示 ISO-C99 代码,该代码以良好的准确性和良好的性能实现 rhypot。通用算法直接源自我在 this answer 中为 hypot 展示的示例实现。对于 hypot ,可以确定参数中最大量级的值,然后找到一个比例因子(出于准确性原因是 2 的幂)将该值映射到单位附近。比例因子应用于两个参数,然后使用 sqrt 函数计算这个转换后的 2-vector 的长度,最后用比例因子的“逆”缩小结果。缩放依赖于实际乘法作为参数可能是无法通过简单的指数操作正确缩放的次法线。
对于 rhypot ,只需要进行两个更改:必须使用倒数平方根函数 rsqrt 而不是 sqrt ,并且输入缩放和结果缩放使用相同的缩放因子。
一些计算环境提供了 rsqrt() 函数,该函数计划包含在 ISO C 标准 (ISO/IEC TS 18661-4:2015) 的 future 版本中。对于不提供平方根倒数函数的环境,我展示了一些可移植的(在问题中所述的平台要求内)和特定于机器的实现。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

uint32_t __float_as_uint32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}

float __uint32_as_float (uint32_t a)
{
float r;
memcpy (&r, &a, sizeof r);
return r;
}

float my_rsqrtf (float);

/* Compute the reciprocal of sqrt (a**2 + b**2), avoiding premature overflow
and underflow in intermediate computation. The accuracy of this function
depends on the accuracy of the reciprocal square root implementation used.
With the rsqrtf() implementations shown below, the following maximum ulp
error was observed for 2**36 random test cases:

CORRECTLY_ROUNDED 1.20736973
SSE_HALLEY 1.33120522
SSE_2NR 1.42086841
SQRT_OOX 1.42906701
BIT_TWIDDLE_3NR 1.43062950
ITO_TAKAGI_YAJIMA_1NR 1.43681737
BIT_TWIDDLE_NR_HALLEY 1.47485797
*/
float my_rhypotf (float a, float b)
{
float fa, fb, mn, mx, scale, s, w, res;
uint32_t expo;

/* sort arguments by magnitude */
fa = fabsf (a);
fb = fabsf (b);
mx = fmaxf (fa, fb);
mn = fminf (fa, fb);
/* compute scale factor */
expo = __float_as_uint32 (mx) & 0xfc000000;
scale = __uint32_as_float (0x7e000000 - expo);
/* scale operand of maximum magnitude towards unity */
mn = mn * scale;
mx = mx * scale;
/* mx in [2**-23, 2**6) */
s = fmaf (mx, mx, mn * mn); // 0.75 ulp
w = my_rsqrtf (s);
/* reverse previous scaling */
res = w * scale;
/* handle special cases */
float t = a + b;
if (!(fabsf (t) <= INFINITY)) res = t; // isnan(t)
if (mx == INFINITY) res = 0.0f; // isinf(mx)
return res;
}

#define CORRECTLY_ROUNDED (1)
#define SSE_HALLEY (2)
#define SSE_2NR (3)
#define ITO_TAKAGI_YAJIMA_1NR (4)
#define SQRT_OOX (5)
#define BIT_TWIDDLE_3NR (6)
#define BIT_TWIDDLE_NR_HALLEY (7)

#define RSQRT_VARIANT (SSE_HALLEY)

#if (RSQRT_VARIANT == SSE_2NR) || (RSQRT_VARIANT == SSE_HALLEY)
#include "immintrin.h"
#endif // (RSQRT_VARIANT == SSE_2NR) || (RSQRT_VARIANT == SSE_HALLEY)

float my_rsqrtf (float a)
{
#if RSQRT_VARIANT == CORRECTLY_ROUNDED
float r = (float) sqrt (1.0/(double)a);
#elif RSQRT_VARIANT == SQRT_OOX
float r = sqrtf (1.0f / a);
#elif RSQRT_VARIANT == SSE_2NR
float r;
/* compute initial approximation */
_mm_store_ss (&r, _mm_rsqrt_ss (_mm_set_ss (a)));
/* refine approximation using two Newton-Raphson iterations */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
#elif RSQRT_VARIANT == SSE_HALLEY
float e, r;
/* compute initial approximation */
_mm_store_ss (&r, _mm_rsqrt_ss (_mm_set_ss (a)));
/* refine approximation using Halley iteration with cubic convergence */
e = fmaf (r * r, -a, 1.0f);
r = fmaf (fmaf (0.375f, e, 0.5f), e * r, r);
#elif RSQRT_VARIANT == BIT_TWIDDLE_3NR
float r;
/* compute initial approximation */
r = __uint32_as_float (0x5f375b0d - (__float_as_uint32(a) >> 1));
/* refine approximation using three Newton-Raphson iterations */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
#elif RSQRT_VARIANT == BIT_TWIDDLE_NR_HALLEY
float e, r;
/* compute initial approximation */
r = __uint32_as_float (0x5f375b0d - (__float_as_uint32(a) >> 1));
/* refine approximation using Newton-Raphson iteration */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
/* refine approximation using Halley iteration with cubic convergence */
e = fmaf (r * r, -a, 1.0f);
r = fmaf (fmaf (0.375f, e, 0.5f), e * r, r);
#elif RSQRT_VARIANT == ITO_TAKAGI_YAJIMA_1NR
/* Masayuki Ito, Naofumi Takagi, Shuzo Yajima, "Efficient Initial
Approximation for Multiplicative Division and Square Root by a
Multiplication with Operand Modification". IEEE Transactions on
Computers, Vol. 46, No. 4, April 1997, pp. 495-498.
*/
#define TAB_INDEX_BITS (7)
#define TAB_ENTRY_BITS (16)
#define TAB_ENTRIES (1 << TAB_INDEX_BITS)
#define FP32_EXPO_BIAS (127)
#define FP32_MANT_BITS (23)
#define FP32_SIGN_MASK (0x80000000)
#define FP32_EXPO_MASK (0x7f800000)
#define FP32_EXPO_LSB_MASK (1u << FP32_MANT_BITS)
#define FP32_INDEX_MASK (((1u << TAB_INDEX_BITS) - 1) << (FP32_MANT_BITS - TAB_INDEX_BITS))
#define FP32_XHAT_MASK (~(FP32_INDEX_MASK | FP32_SIGN_MASK) | FP32_EXPO_MASK)
#define FP32_FLIP_BIT_MASK (3u << (FP32_MANT_BITS - TAB_INDEX_BITS - 1))
#define FP32_ONE_HALF (0x3f000000)

const uint16_t d1tab [TAB_ENTRIES] = {
0xb2ec, 0xaed7, 0xaae9, 0xa720, 0xa37b, 0x9ff7, 0x9c93, 0x994d,
0x9623, 0x9316, 0x9022, 0x8d47, 0x8a85, 0x87d8, 0x8542, 0x82c0,
0x8053, 0x7bf0, 0x775f, 0x72f1, 0x6ea4, 0x6a77, 0x666a, 0x6279,
0x5ea5, 0x5aed, 0x574e, 0x53c9, 0x505d, 0x4d07, 0x49c8, 0x469e,
0x438a, 0x408a, 0x3d9e, 0x3ac4, 0x37fc, 0x3546, 0x32a0, 0x300b,
0x2d86, 0x2b10, 0x28a8, 0x264f, 0x2404, 0x21c6, 0x1f95, 0x1d70,
0x1b58, 0x194c, 0x174b, 0x1555, 0x136a, 0x1189, 0x0fb2, 0x0de6,
0x0c22, 0x0a68, 0x08b7, 0x070f, 0x056f, 0x03d8, 0x0249, 0x00c1,
0xfd08, 0xf742, 0xf1b4, 0xec5a, 0xe732, 0xe239, 0xdd6d, 0xd8cc,
0xd454, 0xd002, 0xcbd6, 0xc7cd, 0xc3e5, 0xc01d, 0xbc75, 0xb8e9,
0xb57a, 0xb225, 0xaeeb, 0xabc9, 0xa8be, 0xa5cb, 0xa2ed, 0xa024,
0x9d6f, 0x9ace, 0x983e, 0x95c1, 0x9355, 0x90fa, 0x8eae, 0x8c72,
0x8a45, 0x8825, 0x8614, 0x8410, 0x8219, 0x802e, 0x7c9c, 0x78f5,
0x7565, 0x71eb, 0x6e85, 0x6b31, 0x67f3, 0x64c7, 0x61ae, 0x5ea7,
0x5bb0, 0x58cb, 0x55f6, 0x5330, 0x5079, 0x4dd1, 0x4b38, 0x48ad,
0x462f, 0x43be, 0x4159, 0x3f01, 0x3cb5, 0x3a75, 0x3840, 0x3616
};
uint32_t arg, idx, d1, xhat;
float r;

arg = __float_as_uint32 (a);
idx = (arg >> ((FP32_MANT_BITS + 1) - TAB_INDEX_BITS)) & ((1u << TAB_INDEX_BITS) - 1);
d1 = FP32_ONE_HALF | (d1tab[idx] << ((FP32_MANT_BITS + 1) - TAB_ENTRY_BITS));
xhat = ((arg & FP32_INDEX_MASK) | (((((3 * FP32_EXPO_BIAS) << FP32_MANT_BITS) + ~arg) >> 1) & FP32_XHAT_MASK)) ^ FP32_FLIP_BIT_MASK;
/* compute initial approximation, accurate to about 14 bits */
r = __uint32_as_float (d1) * __uint32_as_float (xhat);
/* refine approximation with one Newton-Raphson iteration */
r = fmaf (fmaf (-a, r * r, 1.0f), 0.5f * r, r);
#else
#error unsupported RSQRT_VARIANT
#endif // RSQRT_VARIANT
return r;
}

uint64_t __double_as_uint64 (double a)
{
uint64_t r;
memcpy (&r, &a, sizeof r);
return r;
}

double floatUlpErr (float res, double ref)
{
uint64_t i, j, err, refi;
int expoRef;

/* ulp error cannot be computed if either operand is NaN, infinity, zero */
if (isnan (res) || isnan (ref) || isinf (res) || isinf (ref) ||
(res == 0.0f) || (ref == 0.0f)) {
return 0.0;
}
/* Convert the float result to an "extended float". This is like a float
with 56 instead of 24 effective mantissa bits.
*/
i = ((uint64_t)__float_as_uint32(res)) << 32;
/* Convert the double reference to an "extended float". If the reference is
>= 2^129, we need to clamp to the maximum "extended float". If reference
is < 2^-126, we need to denormalize because of the float types's limited
exponent range.
*/
refi = __double_as_uint64(ref);
expoRef = (int)(((refi >> 52) & 0x7ff) - 1023);
if (expoRef >= 129) {
j = 0x7fffffffffffffffULL;
} else if (expoRef < -126) {
j = ((refi << 11) | 0x8000000000000000ULL) >> 8;
j = j >> (-(expoRef + 126));
} else {
j = ((refi << 11) & 0x7fffffffffffffffULL) >> 8;
j = j | ((uint64_t)(expoRef + 127) << 55);
}
j = j | (refi & 0x8000000000000000ULL);
err = (i < j) ? (j - i) : (i - j);
return err / 4294967296.0;
}

double rhypot (double a, double b)
{
return 1.0 / hypot (a, b);
}

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579) /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)

#define FP32_QNAN_BIT (0x00400000)

int main (void)
{
float af, bf, resf, reff;
uint32_t ai, bi, resi, refi;
double ref, err, maxerr = 0;
uint64_t diff, diffsum = 0, count = 1ULL << 36;

do {
ai = KISS;
bi = KISS;
af = __uint32_as_float (ai);
bf = __uint32_as_float (bi);

resf = my_rhypotf (af, bf);
ref = rhypot ((double)af, (double)bf);
reff = (float)ref;

refi = __float_as_uint32 (reff);
resi = __float_as_uint32 (resf);

diff = llabs ((long long int)resi - (long long int)refi);
/* If both inputs are a NaN, result can be either argument, converted
to QNaN if necessary. If one input is NaN and the other not infinity
the NaN input must be returned, converted to QNaN if necessary. If
one input is infinity, zero must be returned even if the other input
is a NaN. In all other cases allow up to 1 ulp of difference.
*/
if ((isnan (af) && isnan (bf) && (resi != (ai | FP32_QNAN_BIT)) && (resi != (bi | FP32_QNAN_BIT))) ||
(isnan (af) && !isinf (bf) && !isnan (bf) && (resi != (ai | FP32_QNAN_BIT))) ||
(isnan (bf) && !isinf (af) && !isnan (af) && (resi != (bi | FP32_QNAN_BIT))) ||
(isinf (af) && (resi != 0)) ||
(isinf (bf) && (resi != 0)) ||
(diff > 1)) {
printf ("err @ (%08x,%08x): res= %08x (%15.8e) ref=%08x (%15.8e)\n",
ai, bi, resi, resf, refi, reff);
return EXIT_FAILURE;
}
diffsum += diff;
err = floatUlpErr (resf, ref);
if (err > maxerr) {
printf ("ulp=%.8f @ (% 15.8e, % 15.8e): res=%15.6a ref=%22.13a\n",
err, af, bf, resf, ref);
maxerr = err;
}
count--;
} while (count);
printf ("diffsum = %llu\n", diffsum);
return EXIT_SUCCESS;
}

关于algorithm - 高效准确地计算 hypot(a,b) 的倒数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63629892/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com