gpt4 book ai didi

c++ - 如何在 gpu v100 上使用 fma?

转载 作者:行者123 更新时间:2023-11-28 04:04:30 28 4
gpt4 key购买 nike

我在 https://docs.nvidia.com/cuda/floating-point/index.html 上试过 FMA 的例子|

union  {
float f;
unsigned int i
} a, b;
float r;

a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f * a.f + b.f;

printf("a %.8g\n", a.f);
printf("b %.8g\n", b.f);
printf("r %.8g\n", r);

但是,我在 gpu 上得到了 0。我的test.cu如下图:

#include <stdio.h>
#include <iostream>
using namespace std;
#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
return EXIT_FAILURE;}} while(0)

#define BLOCKS 1 //useless
#define TPB 3 //useless
#define TIMES 5 //useless

__global__ void test() {
union {
float f;
unsigned int i;
} a, b;
float r;

a.i = 0x3F800001;
b.i = 0xBF800002;
r = a.f * a.f + b.f;
printf("a %.30g\n", a.f);
printf("b %.30g\n", b.f);
printf("r %.30g\n", r);
}

int main() {
float *devResults; //useless

CUDA_CALL(cudaMalloc((void **)&devResults, BLOCKS * TPB * TIMES *
sizeof(float)));

CUDA_CALL(cudaMemset(devResults, 0, BLOCKS * TPB * TIMES *
sizeof(float)));
test<<<1, 1>>>();
CUDA_CALL(cudaFree(devResults));
return 0;
}

我使用以下方法编译了 test.cu:nvcc test.cu --fmad=true

当我调用“fma”函数时,它起作用了。但是,它应该在没有“fma”的情况下工作。

最佳答案

您对正在发生的事情的解释不正确。在详细查看您的代码之前,请查看我对您尝试执行的操作的版本(让我们撇开技术上未定义行为的 union 问题):

#include <stdio.h>

typedef union {
float f;
unsigned int i;
} bodge;

__global__ void test(unsigned int x, unsigned int y, float* out, bool dowrite) {

bodge a, b;
a.i = x;
b.i = y;

float r = a.f * a.f + b.f;
printf("a %.30g\n", a.f);
printf("b %.30g\n", b.f);
printf("r %.30g\n", r);
if (dowrite) *out = r;
}

int main() {
test<<<1, 1>>>(0x3F800001, 0xBF800002, (float*)0, false);
cudaDeviceSynchronize();
cudaDeviceReset();
return 0;
}

在 Maxwell GPU 上像这样编译和运行:

$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 1.42108547152020037174224853516e-14

如果我们查看汇编程序的输出,我们会在 /*0058*/ 清楚地看到一条 FMAD 指令:

$ cuobjdump -sass fmad

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

code for sm_52
Function : _Z4testjjPfb
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001c4400fe0007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ { IADD32I R1, R1, -0x8; /* 0x1c0fffffff870101 */
/*0018*/ F2F.F64.F32 R10, c[0x0][0x140]; } /* 0x4ca8000005070b0a */
/* 0x001fc400fe2007f5 */
/*0028*/ MOV R17, c[0x0][0x140]; /* 0x4c98078005070011 */
/*0030*/ LOP.OR R2, R1, c[0x0][0x4]; /* 0x4c47020000170102 */
/*0038*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/* 0x001fc800fc2007f1 */
/*0048*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*0050*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0058*/ FFMA R17, R17, R17, c[0x0][0x144]; /* 0x5180088005171111 */
/* 0x0023c800ffe007f1 */
/*0068*/ LOP32I.AND R16, R2, 0xffffff; /* 0x04000ffffff70210 */
/*0070*/ MOV R6, R2; /* 0x5c98078000270006 */
/*0078*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/* 0x001c4400fe000ffd */
/*0088*/ JCAL 0x0; /* 0xe220000000000040 */
/*0090*/ { MOV32I R4, 0x0; /* 0x010000000007f004 */
/*0098*/ F2F.F64.F32 R10, c[0x0][0x144]; } /* 0x4ca8000005170b0a */
/* 0x001ffc00fe2007f1 */
/*00a8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*00b0*/ MOV R6, R2; /* 0x5c98078000270006 */
/*00b8*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/* 0x001fc001ffa008f2 */
/*00c8*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/*00d0*/ JCAL 0x0; /* 0xe220000000000040 */
/*00d8*/ { MOV R6, R2; /* 0x5c98078000270006 */
/*00e8*/ F2F.F64.F32 R10, R17; } /* 0x001fc400fe200711 */
/* 0x5ca8000001170b0a */
/*00f0*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00f8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/* 0x003ff4011e4007ff */
/*0108*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0110*/ STL.64 [R16], R10; /* 0xef5500000007100a */
/*0118*/ JCAL 0x0; /* 0xe220000000000040 */
/* 0x003fb401e3a0071f */
/*0128*/ LDC.U8 R0, c[0x0][0x150]; /* 0xef9000001507ff00 */
/*0130*/ I2I.S16.S8 R0, R0; /* 0x5ce0000000073100 */
/*0138*/ LOP.AND.NZ P0, RZ, R0, 0xff; /* 0x384030000ff700ff */
/* 0x001fc800fe2007fd */
/*0148*/ @!P0 EXIT; /* 0xe30000000008000f */
/*0150*/ MOV R2, c[0x0][0x148]; /* 0x4c98078005270002 */
/*0158*/ MOV R3, c[0x0][0x14c]; /* 0x4c98078005370003 */
/* 0x001ffc00ffe000f1 */
/*0168*/ STG.E [R2], R17; /* 0xeedc200000070211 */
/*0170*/ EXIT; /* 0xe30000000007000f */
/*0178*/ BRA 0x178; /* 0xe2400fffff87000f */
.......................

那么现在让我们看看您的内核代码:

$ nvcc -arch=sm_52 --fmad=true -o fmad fmad.cu
$ ./fmad
a 1.00000011920928955078125
b -1.0000002384185791015625
r 0

不同的结果。反汇编说明了原因:

$ cuobjdump -sass fmad

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

code for sm_52
Function : _Z5test0v
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001fc400fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ IADD32I R1, R1, -0x8; /* 0x1c0fffffff870101 */
/*0018*/ LOP.OR R2, R1, c[0x0][0x4]; /* 0x4c47020000170102 */
/* 0x001fc400fe2007f1 */
/*0028*/ MOV32I R10, 0x20000000; /* 0x010200000007f00a */
/*0030*/ MOV32I R11, 0x3ff00000; /* 0x0103ff000007f00b */
/*0038*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/* 0x001fc000fe4007e2 */
/*0048*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*0050*/ LOP32I.AND R16, R2, 0xffffff; /* 0x04000ffffff70210 */
/*0058*/ { MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0068*/ STL.64 [R16], R10; } /* 0x003ff400fec000f1 */
/* 0xef5500000007100a */
/*0070*/ MOV R6, R2; /* 0x5c98078000270006 */
/*0078*/ JCAL 0x0; /* 0xe220000000000040 */
/* 0x001fc000fe4007f1 */
/*0088*/ MOV32I R10, 0x40000000; /* 0x010400000007f00a */
/*0090*/ MOV32I R11, 0xbff00000; /* 0x010bff000007f00b */
/*0098*/ { MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00a8*/ STL.64 [R16], R10; } /* 0x001fc400fe2000f1 */
/* 0xef5500000007100a */
/*00b0*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/*00b8*/ MOV R6, R2; /* 0x5c98078000270006 */
/* 0x001fc001ffa007e6 */
/*00c8*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*00d0*/ JCAL 0x0; /* 0xe220000000000040 */
/*00d8*/ { MOV R6, R2; /* 0x5c98078000270006 */
/*00e8*/ STL.64 [R16], RZ; } /* 0x001fc400fe2000f1 */
/* 0xef550000000710ff */
/*00f0*/ MOV32I R4, 0x0; /* 0x010000000007f004 */
/*00f8*/ MOV32I R5, 0x0; /* 0x010000000007f005 */
/* 0x001ffc01ffa007f6 */
/*0108*/ MOV R7, RZ; /* 0x5c9807800ff70007 */
/*0110*/ JCAL 0x0; /* 0xe220000000000040 */
/*0118*/ EXIT; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*0128*/ BRA 0x120; /* 0xe2400fffff07000f */
/*0130*/ NOP; /* 0x50b0000000070f00 */
/*0138*/ NOP; /* 0x50b0000000070f00 */
....................

可以看到根本没有浮点指令。为什么?因为编译器已经确定内核中的所有内容都是常量,所以预先计算结果并将其代入发出的代码是安全的。我假设计算是以更高的精度完成的,这就是结果为 0 的原因(或者存在内部编译器错误)。

关于c++ - 如何在 gpu v100 上使用 fma?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58989033/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com