c - 使用单精度浮点系统进行 double 浮点加/减/乘/除运算的简单 C 示例-6ren

c - 使用单精度浮点系统进行 double 浮点加/减/乘/除运算的简单 C 示例

转载作者：行者123 更新时间：2023-11-30 16:43:23

我正在研究一种算法，其计算涉及的数值很大，最高可达 1e+30 量级。我使用的是 32 位系统，编译器只支持 32 位的 long/float/double。到目前为止，通过网上搜索，我了解到可以用两个单精度浮点数(FP)来模拟 double 精度的浮点运算。

从之前有人提出的问题(Emulate “double” using 2 “float”s)中，我发现这篇论文具有在 GPU 中使用 double FP 的算法。用 C 实现对我来说太困惑了。我只需要四个基本的数学运算。有什么办法可以找到一个例子来帮助我更好地理解它吗？

提前致谢。

这是我正在编写的代码。它可能有我看不到的错误，任何纠正错误的建议都将不胜感激，但这正是我想要实现的。在算法中，POLYNOMIAL_ORDER 应该能够上升到四阶(如果标准偏差较小，可以稳定在三阶)。我不确定的事情有:1)函数 make_float() 和 make_float2() 是否正确，2)程序中对 make_float() 的用法是否正确。

/* Degree of the fitted polynomial; the solver loops over indices 0..POLYNOMIAL_ORDER. */
#define POLYNOMIAL_ORDER    (3)
/* Number of (Temp, Comp) entries in TCtable. */
#define TC_TABLE_SIZE   (14)

/*
 * Pair of single-precision values used by the float-float helpers.
 * twoSum()/quickTwoSum()/twoProd() place their primary result in x and the
 * accompanying correction term in y.
 */
typedef struct vector_float2 {
    float x;    /* primary (leading) value */
    float y;    /* correction term paired with x */
} float2;

/*
 * Coefficients of the fitted polynomial
 *     Y = tc4*T^4 + tc3*T^3 + tc2*T^2 + tc1*T + tc0.
 * tc4 is appended (after the existing members, so positional initialisers
 * keep working) because the order-4 branch of calculate_equation() reads it.
 */
typedef struct
{
    float tc0;  /* constant term */
    float tc1;  /* coefficient of T */
    float tc2;  /* coefficient of T^2 */
    float tc3;  /* coefficient of T^3 */
    float tc4;  /* coefficient of T^4 (only meaningful for POLYNOMIAL_ORDER == 4) */
}POLYNOMIALS;

/*
 * One calibration point: a raw temperature reading and the compensation
 * value measured at that reading.
 *
 * Temp is unsigned because the table contains readings up to 60027, which do
 * not fit in int16_t, and calculate_equation() takes its argument as uint16_t.
 */
typedef struct  {
    uint16_t Temp;  /* raw temperature reading (0..65535) */
    int16_t Comp;   /* signed compensation value */
} TempCompPair;

/*
 * Calibration table of TC_TABLE_SIZE (Temp, Comp) pairs used as the input of
 * the polynomial fit. Temp readings run from 22452 up to 60027, so the Temp
 * field has to be able to represent values above 32767.
 */
volatile TempCompPair TCtable[TC_TABLE_SIZE] = {
    { .Temp = 22452, .Comp =  1651 },
    { .Temp = 25318, .Comp =  1444 },
    { .Temp = 28268, .Comp =  1133 },
    { .Temp = 31120, .Comp =   822 },
    { .Temp = 34027, .Comp =   511 },
    { .Temp = 36932, .Comp =   185 },
    { .Temp = 39770, .Comp =   -81 },
    { .Temp = 42685, .Comp =  -288 },
    { .Temp = 45531, .Comp =  -407 },
    { .Temp = 48425, .Comp =  -632 },
    { .Temp = 51401, .Comp =  -703 },
    { .Temp = 54460, .Comp = -1143 },
    { .Temp = 57202, .Comp = -1420 },
    { .Temp = 60027, .Comp = -1652 },
};

/* Fitted coefficients; written by TComp_Polynomial(), read by calculate_equation(). */
POLYNOMIALS polynomials;
/* matrix[i][j] receives the sum over the table of x^(i+j); only indices 0..POLYNOMIAL_ORDER are used. */
float matrix[TC_TABLE_SIZE][TC_TABLE_SIZE] = {0};
/* average[i] receives the sum over the table of y * x^i; only indices 0..POLYNOMIAL_ORDER are used. */
float average[TC_TABLE_SIZE] = {0};

/*
 * Collapse a two-component value back into a single float.
 *
 * x, y: the two components (callers pass the .x and .y of a float2).
 * Returns x + y rounded to float.
 */
float make_float(float x, float y)
{
    const float collapsed = x + y;

    return collapsed;
}

float2 make_float2(float a, float b)
{
    float2 f2 = {a,b};
    return f2;
}

float2 quickTwoSum(float a, float b)
{
    float s = a+b;
    float e = b - (s - a);

    float2 result = {s, e};
    return result;
}

float2 twoSum(float a, float b)
{
    volatile float s = a + b;
    float v = s - a;
    float e = (a - (s - v)) + (b - v);
    float2 result = {s , e};
    return result;
}

/*
 * Add two float2 values.
 *
 * The .x parts and the .y parts are summed separately with twoSum(); the
 * partial results are then folded together and renormalised twice with
 * quickTwoSum().
 */
float2 df64_add(float2 a, float2 b)
{
    float2 head = twoSum(a.x, b.x);
    float2 tail = twoSum(a.y, b.y);

    /* fold the sum of the .y parts into the running correction */
    head.y = head.y + tail.x;
    head = quickTwoSum(head.x, head.y);

    /* fold the remaining correction of the .y sum, then renormalise again */
    head.y = head.y + tail.y;
    return quickTwoSum(head.x, head.y);
}

float2 split(float a)
{
    const float split = 4097;       //(1<<12) + 1
    float t = a *split;
    float a_hi = t - (t - a);
    float a_lo = a - a_hi;
    float2 result = {a_hi, a_lo};
    return result;
}

float2 twoProd(float a, float b)
{
    float p = a*b;
    float2 aS = split(a);
    float2 bS = split(b);
    float err = ((aS.x * bS.x - p)
                + aS.x * bS.y + aS.y * bS.x)
                + aS.y * bS.y;

    float2 result = {p, err};
    return result;
}

/*
 * Multiply two float2 values.
 *
 * The .x parts are multiplied with twoProd(); the two cross terms
 * a.x*b.y and a.y*b.x are added to the correction, and the pair is
 * renormalised with quickTwoSum(). The a.y*b.y term is not included.
 */
float2 df64_mult(float2 a, float2 b)
{
    float2 prod = twoProd(a.x, b.x);

    prod.y = prod.y + a.x * b.y;
    prod.y = prod.y + a.y * b.x;

    return quickTwoSum(prod.x, prod.y);
}

float2 calculate_power(float base, int pow)
{
    int i = 0;

    float2 base_f2 = make_float2(base,0);
    float2 result_f2 = {1,0};

    if(pow == 0)
    {
        return result_f2;
    }

    if(pow > 0)
    {
        if(pow == 1)
        {
            return base_f2;
        }
        else
        {
            for(i = 0; i < pow; i++)
            {
                result_f2 = df64_mult(result_f2,base_f2);
            }
            return result_f2;
        }
    }
    else
    {
        return result_f2;
        //Mechanism for negative powers
    }

}

/*
 * Fit a polynomial of degree POLYNOMIAL_ORDER to the (Temp, Comp) pairs of
 * TCtable and store the coefficients in the global `polynomials`.
 *
 * Steps:
 *   1. matrix[i][j] = sum over the table of x^(i+j)   (float2 accumulation)
 *   2. average[i]   = sum over the table of y * x^i   (float2 accumulation)
 *   3. Eliminate every off-diagonal entry of `matrix`, applying the same row
 *      operations to `average`.
 *   4. tcN = average[N] / matrix[N][N] for N = 0..3 when the pivot is non-zero.
 *
 * Only tc0..tc3 are written here. Globals `matrix` and `average` are
 * overwritten (indices 0..POLYNOMIAL_ORDER).
 */
void TComp_Polynomial()
{
    int i;
    int j;
    int k;
    int size;
    float temp;
    float2 sum = {0,0};
    float2 result0 = {0,0};
    float2 result1 = {0,0};

    float x[TC_TABLE_SIZE];
    float y[TC_TABLE_SIZE];

    for(i = 0; i < TC_TABLE_SIZE; i++)
    {
        x[i] = (float) TCtable[i].Temp;
        y[i] = (float) TCtable[i].Comp;
    }

    size = TC_TABLE_SIZE;

    /* Step 1: matrix[i][j] = sum of x[k]^(i+j) */
    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            sum.x = 0;
            sum.y = 0;

            for(k = 0; k < size; k++)
            {
                result0 = calculate_power(x[k], i+j);
                sum = df64_add(result0,sum);
            }

            matrix[i][j] = make_float(sum.x,sum.y);
        }
    }

    /* Step 2: average[i] = sum of y[j] * x[j]^i */
    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        sum.x = 0;
        sum.y = 0;

        for(j = 0; j < size; j++)
        {
            result0 = calculate_power(x[j], i);
            result1 = df64_mult( result0 , make_float2(y[j],0) );
            sum = df64_add(result1,sum);
        }

        average[i] = make_float(sum.x,sum.y);
    }

    /* Step 3: clear column i in every row except row i */
    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        if(matrix[i][i] == 0)
        {
            /* No usable pivot: skip instead of reusing a stale or
             * uninitialised multiplier. */
            continue;
        }

        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            if(j == i)
            {
                continue;
            }

            temp = matrix[j][i]/matrix[i][i];

            /* The matrix has POLYNOMIAL_ORDER + 1 columns, so the last
             * column (index POLYNOMIAL_ORDER) must be updated as well;
             * otherwise the final pivot keeps its original value. */
            for(k = i; k <= POLYNOMIAL_ORDER; k++)
            {
                matrix[j][k] -= temp*matrix[i][k];
            }
            average[j] -= temp*average[i];
        }
    }

    /* Step 4: read the coefficients off the diagonal */
    if(matrix[0][0] != 0)
    {
        polynomials.tc0 = average[0]/matrix[0][0];
    }
    if(matrix[1][1] != 0)
    {
        polynomials.tc1 = average[1]/matrix[1][1];
    }
    if(matrix[2][2] != 0)
    {
        polynomials.tc2 = average[2]/matrix[2][2];
    }
    if(matrix[3][3] != 0)
    {
        polynomials.tc3 = average[3]/matrix[3][3];
    }
}

然后在下面的表达式中使用struct polynomials.tc0/1/2/3

// Y = T^3 * X3 + T^2 * X2 + T^1 * X1 + X0 ;

/*
 * Evaluate the fitted polynomial at TEMP:
 *     Y = tcN*T^N + ... + tc1*T + tc0,   N = POLYNOMIAL_ORDER.
 *
 * TEMP: raw temperature reading.
 * Returns the polynomial value computed in double arithmetic.
 *
 * The branch is chosen by the preprocessor so that only the coefficients
 * that exist for the configured order are referenced (tc4 is touched only
 * when POLYNOMIAL_ORDER == 4), and Y is always assigned before it is
 * returned. Orders outside 0..4 are rejected at compile time.
 */
double calculate_equation(uint16_t TEMP)
{
    const double t = (double)TEMP;
    double Y;

    (void)t;    /* unused when POLYNOMIAL_ORDER == 0 */

#if POLYNOMIAL_ORDER == 0
    Y = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
    Y = polynomials.tc1 * t + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
    Y = (polynomials.tc2 * t + polynomials.tc1) * t + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
    Y = ((polynomials.tc3 * t + polynomials.tc2) * t + polynomials.tc1) * t + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
    Y = (((polynomials.tc4 * t + polynomials.tc3) * t + polynomials.tc2) * t + polynomials.tc1) * t + polynomials.tc0;
#else
#error "calculate_equation() supports POLYNOMIAL_ORDER 0..4 only"
#endif

    return Y;
}

标准偏差计算如下:

// Error metric: square root of the summed squared errors, sqrt(sigma(error^2)).
// NOTE(review): the sum is not divided by the number of samples, so this is
// not a standard deviation in the usual sense — confirm the intended metric.
// NOTE(review): i, actual_comp, error, error_sqr and error_sqrt are declared
// outside this fragment; error_sqr presumably starts at 0 — verify.
for(i = 0; i < TC_TABLE_SIZE; i++)
    {
        // fitted value at the table temperature, truncated to int
        actual_comp[i] =(int) calculate_equation(TCtable[i].Temp);
        // residual between measured and fitted compensation
        error[i] = TCtable[i].Comp - actual_comp[i] ;
        error_sqr += error[i]*error[i];

        // NOTE(review): actual_comp[i] is assigned from an (int) cast but is
        // printed with %e; if the array holds ints the conversion specifier
        // does not match its argument — confirm the element type.
        printf("%u\t%d\t\t%e\n", TCtable[i].Temp, TCtable[i].Comp, actual_comp[i] );
    }
    error_sqrt = sqrt(error_sqr);

引用: http://hal.archives-ouvertes.fr/docs/00/06/33/56/PDF/float-float.pdf Guillaume Da Graça、David Defour，《在图形硬件上实现 float-float 运算符》(Implementation of float-float operators on graphics hardware)，第七届实数与计算机会议(RNC7)。

最佳答案

我能够在不使用 double 的情况下实现此代码，因为计算在 Float 范围内。这是我的实现，如果我可以更好地优化它，请告诉我。

/*
 * Fixed-point polynomial coefficients, one per power of the temperature:
 * tcN multiplies T^N in calculate_equation().
 */
typedef struct
{
    int64_t tc0;    /* constant term */
    int64_t tc1;    /* coefficient of T */
    int64_t tc2;    /* coefficient of T^2 */
    int64_t tc3;    /* coefficient of T^3 */
    int64_t tc4;    /* coefficient of T^4 */
} POLYNOMIALS;

/* Fitted fixed-point coefficients; written by TComp_Polynomial(), read by calculate_equation(). */
POLYNOMIALS polynomials = {0,0,0,0,0};
/* NOTE(review): not referenced anywhere in the visible code. */
int16_t TempCompIndex;
/* NOTE(review): TComp_Polynomial() declares local arrays with these same names,
 * which shadow these globals inside that function. */
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];

/* matrix[i][j] receives the sum over the table of x^(i+j). */
float matrix[POLYNOMIAL_ORDER+1][POLYNOMIAL_ORDER+1] = {0};
/* average[i] receives the sum over the table of y * x^i. */
float average[POLYNOMIAL_ORDER+1] = {0};

void TComp_Polynomial()
{
    int i;
    int j;
    int k;
    int size;
    float temp;
    float sum = 0;
    float powr = 0;
    float prod;

    int64_t x[TC_TABLE_SIZE];
    int64_t y[TC_TABLE_SIZE];

    for(i = 0; i < TC_TABLE_SIZE; i++)
    {
        x[i] = (int64_t) TCtable[i].Temp;
        y[i] = (int64_t) TCtable[i].Comp<<PRECISION;
        printf("x: %lld, y:%lld\n",x[i],y[i]);
    }

    size = i;

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            sum = 0;
            powr = 0;
            for(k = 0; k < size; k++)
            {       
                //printf("x[%d]: %ld, i: %d ,j: %d ", k, x[k],i,j);
                powr = pow(x[k],i+j);
                //printf("Power: %f, sum: %f\n ",powr,sum);
                sum +=  powr;
                //printf("%f\r\n",powr);
                //printf("sum: %lf\n",sum );
            }

            matrix[i][j] = sum;
            printf("sum: %g\n",sum);
        }
    }

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        sum = 0;
        powr = 0;

        for(j = 0; j < size; j++)
        {
            //sum += y[j] * pow(x[j],i)
            //printf("sum: %lf, y[%d]: %lf, x[%d]: %lf^%d  ",sum,j,y[j], i, x[j],i);
            //printf("x[%d]:%lld ^ %d\t",j,x[j],i);
            powr = (float) pow(x[j],i);
            printf("powr: %f\t",powr);

            prod = (float) y[j] * powr;
            printf("prod:%f \t %lld \t", prod,y[j]);

            sum += (float) prod;
            printf("sum: %f \n",sum);
        }

        average[i] = sum;
        //printf("#Avg: %f\n",average[i]);
    }
    printf("\n\n");

    for(i = 0; i <= POLYNOMIAL_ORDER; i++)
    {
        for(j = 0; j <= POLYNOMIAL_ORDER; j++)
        {
            if(j != i)
            {   
                if(matrix[i][i]!= 0)
                {
                    //printf("matrix%d%d: %g / matrix%d%d: %g =\t ",j,i,matrix[j][i],i,i,matrix[i][i]);
                    temp = matrix[j][i]/matrix[i][i];
                    //printf("Temp: %g\n",temp);
                }   

                for(k = i; k < POLYNOMIAL_ORDER; k++)
                {   
                    matrix[j][k] -= temp*matrix[i][k];
                    //printf("matrix[%d][%d]:%g, %g, matrix[%d][%d]:%g\n",j,k,matrix[j][k], temp,i,k,matrix[i][k]);
                }
                //printf("\n\n");
                //print_matrix();
                printf("\n\n");

                //printf("avg%d: %g\ttemp: %g\tavg%d: %g\n\n",j,average[j],temp,i,average[i]);
                average[j] -= temp*average[i];
                printf("#Avg%d:%g\n",j,average[j]);
                //print_average();
            }
        }
    }

    print_matrix();
    print_average();




/* Calculate polynomial Coefficients (n+1) based on the POLYNOMIAL_ORDER (n) */
#ifndef POLYNOMIAL_ORDER

#elif POLYNOMIAL_ORDER == 0
    if(matrix[0][0] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
    }
#elif POLYNOMIAL_ORDER == 1
    if(matrix[1][1] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
    }
#elif POLYNOMIAL_ORDER == 2
    if(matrix[2][2] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
        polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
    }
#elif POLYNOMIAL_ORDER == 3
    if(matrix[3][3] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
        polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
        polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
    }
#elif POLYNOMIAL_ORDER == 4
    if(matrix[4][4] != 0)
    {
        polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
        polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
        polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
        polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
        polynomials.tc4 = (int64_t) (average[4]/matrix[4][4]);
    }
#endif

    }



 int16_t calculate_equation(uint16_t TEMP)
{
    /*
     * Evaluate the fitted polynomial at TEMP using 64-bit integer arithmetic
     * only (nested multiply-add form), then drop PRECISION_BITS fractional
     * bits with a right shift and narrow the result to int16_t.
     * When POLYNOMIAL_ORDER is not defined the accumulator stays at 0.
     */
    const int64_t t = (int64_t)TEMP;
    int64_t acc = 0;

    (void)t;    /* not referenced by every configuration below */

#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
    acc = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
    acc = polynomials.tc1 * t + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
    acc = (polynomials.tc2 * t + polynomials.tc1) * t + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
    acc = ((polynomials.tc3 * t + polynomials.tc2) * t + polynomials.tc1) * t + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
    acc = (((polynomials.tc4 * t + polynomials.tc3) * t + polynomials.tc2) * t + polynomials.tc1) * t + polynomials.tc0;
#endif

    return (int16_t) (acc >> PRECISION_BITS);
}

void main(){
int16_t TempComp = 0;
TempCompValue = (int16_t) calculate_equation(Mon_Temp);
}

注意:Calculate_Equation() 每秒调用一次，并且要求不使用浮点以避免浮点运算，因此我在该函数中使用非浮点变量。

它对我来说工作正常，并且在初始测试后没有发现任何错误。感谢大家对我的帖子感兴趣；即使没有得到直接的答案，我也学到了一些新技术。谢谢@chux。

关于c - 使用单精度浮点系统进行 double 浮点加/减/乘/除运算的简单 C 示例，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/45307459/

文章推荐： c# - 为什么返回类型的方法会导致动态的隐式类型化？

文章推荐： c - atoi — 如何识别零和错误之间的区别？

文章推荐： c# - 如何使用 NLog Elasticsearch Target 定义自定义字段

文章推荐： android - 模拟器:错误:无法加载OpenGLES仿真库

delphi - 单精度、 double 和精度
我知道存储单个值(或 double 值)不可能非常精确。因此，例如存储 125.12 可能会得到 125.1200074788。现在在delphi中，它们是一些有用的函数，例如samevalue或co
algorithm - IEEE754 单精度 - 表示数字一半的通用算法
假设 N 是根据 IEEE754 单精度标准表示的任意数字。我想在 IEEE754 中再次找到 N/2 的最精确可能表示。我想找到一个通用算法(用文字描述，我只想考虑必要的步骤和情况)来获得表示。
c - 单精度 float 在零附近时的近似分辨率是多少
我将许多经度和纬度存储为 doubles，我想知道我是否可以将它们存储为 floats。要回答这个问题，我需要知道 single precision floating point number 的近
单精度 IEEE 754 float 的格式
我需要以一种不会丢失任何信息的方式将单精度数字表示为文本(这样我就可以得到相同的数字，可能会忽略 NaN 等)，但没有太多的伪数字 - 所以单精度 0.1 出来了“0.1”不是“0.100000001
c - 单精度 float 乘以 2
这是一个家庭作业问题。我已经在网上找到了很多代码，包括StackOverflow中的一些代码。但我只想要概念而不是代码。我想自己实现。所以我要实现的功能是: float_twice - 返回浮点参数
c++ - IBM 单精度 float 据转换为预期值
我需要从二进制文件中读取值。数据格式为 IBM 单精度 float (4 字节十六进制指数数据)。我有 C++ 代码从文件中读取并取出每个字节并像这样存储它 unsigned char buf[BU
c++ - IBM 单精度 float 据转换为预期值
我需要从二进制文件中读取值。数据格式为 IBM 单精度 float (4 字节十六进制指数数据)。我有 C++ 代码从文件中读取并取出每个字节并像这样存储它 unsigned char buf[BU
c - 如何仅使用整数算术生成 IEEE 754 单精度 float ？
假设低端微处理器没有浮点运算，我需要生成一个 IEE754 单精度浮点格式数字以推送到文件。我需要编写一个函数，它接受三个整数(符号、整数和分数)，并返回一个字节数组，其中 4 个字节是 IEEE
python - 将二进制字符串转换为 IEEE-754 单精度 - Python
我有一个由 NumPy 创建的二进制矩阵。该矩阵有 5 行和 32 列。 array([[1, 1, ..., 1, 1], [0, 1, ..., 0, 1], [1, 1, ...,
python - 如何在不设置第 23 位的情况下在 python 中创建自定义 NaN(单精度)？
我正在尝试通过选择分数位来创建浮点 NaN。但似乎 python float 在解释 NaN 时总是设置第 23 个小数位(IEEE754 单)。所以，我的问题是:是否可以在不设置第 23 位的情
java - 从 IEEE 754 单精度(32 位)浮点列表创建字节流
有没有办法转换 IEEE 单精度(32 位)列表: String result = getdata(); String[] floats = result.split(","); List float
types - 为什么 IEEE754 单精度 float 只有 7 位精度？
为什么单精度 float 具有 7 位精度(或 double 15-16 位精度)？谁能解释一下我们是如何根据分配给 float(Sign(32) Exponent(30-23), Fraction
c++ - 单精度 float 的第 24 个小数位在哪里？ IEEE 754
今天我发现自己在做一些位操作，我决定稍微刷新一下我的浮点知识! 在我看到这个之前，一切都很好: ... 23 fraction bits of the significand appear in th
c - 如何检查是否使用了 IEEE 754 单精度(32 位)浮点表示法？
我想在我的目标板上测试以下内容: 'float' 是使用 IEEE 754 单精度(32 位)浮点变量实现的吗？ 'double' 是否使用 IEEE 754 double (64 位)浮点变量实现？
floating-point - IEEE-754 32 位(单精度)指数 -126 而不是 -127
我知道我是否有这样的号码: 1 | 1001 0001 | 0011 0011 0000 0001 0101 000 1 sign bit | 8 bit biased exponent | 23
java - 在 Java 中获取(十进制) float 的 IEEE 754(单精度)表示
我确定我遗漏了一些东西。我使用这个代码: int bitsVal = Float.floatToIntBits(f); String bitsString = Integer.toString(bit
ms-access - Access 查询的 CSV 导出将浮点(单精度/ double )值限制为小数点后 2 位
我花了几个小时将小数位数更改为 8，而不是使用 VBA Access 的 2。我找到了一些使用此标签来更改系统属性的解决方案: 公共(public)常量 LOCALE_ILZERO = &H12 但它

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - 使用单精度浮点系统进行 double 浮点加/减/乘/除运算的简单 C 示例