
c - Float4 is not faster than float in CUDA

Edit: njuffa is right; this version was compiled with -G, which disables all optimizations. With the loads and stores vectorized, the new SASS is faster.

Starting from the classic example, I put together two versions of vector addition in CUDA. The fact is that the float4 version takes twice as long as the float version, even though its element count is four times smaller. Profiling both kernels clearly shows that the float4 version performs an average of 4 loads and 4 stores per transaction, while the float version performs only 1 of each. This sounds like a rookie problem with misaligned float4 accesses, which, by the way, the PTX below seems to confirm, but I cannot find where it comes from.

I am using the CUDA 7.0 RC with a Quadro K4000.

Any ideas on where to look?

Compilation options?

The __aligned__ keyword?
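
A quick way to test the misalignment hypothesis directly is to check the device pointers at run time. The helper below is only an illustrative sketch, not part of the original code; pointers returned by cudaMalloc are aligned to at least 256 bytes, so the check is expected to pass for the allocations used here.

#include <cassert>
#include <cstdint>

// Illustrative only: a float4 access requires 16-byte alignment; cudaMalloc
// returns allocations aligned to at least 256 bytes, so this should hold
// for dev_a4, dev_b4 and dev_c4 in the code below.
void check_float4_alignment(const void *p)
{
    assert(reinterpret_cast<std::uintptr_t>(p) % 16 == 0);
}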


__global__ void add_float(float *c, const float *a, const float *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    c[i] = a[i] + b[i];
}

__global__ void add_float4(float4 *c, const float4 *a, const float4 *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    float4 a1 = a[i];
    float4 b1 = b[i];

    float4 c1;
    c1.x = a1.x + b1.x;
    c1.y = a1.y + b1.y;
    c1.z = a1.z + b1.z;
    c1.w = a1.w + b1.w;

    c[i] = c1;
}

The PTX for the line

float4 a1 = a[i];

reads:

...
ld.f32 %f1, [%rd6];
ld.f32 %f2, [%rd6+4];
ld.f32 %f3, [%rd6+8];
ld.f32 %f4, [%rd6+12];
st.f32 [%SP+12], %f4;
st.f32 [%SP+8], %f3;
st.f32 [%SP+4], %f2;
st.f32 [%SP+0], %f1;
...

The SASS objdump says:

    /*0108*/                   MOV R10, R0;                             /* 0x2800000000029de4 */
/*0110*/ ISET.LT.AND R11, R0, RZ, PT; /* 0x108e0000fc02dc23 */
/*0118*/ MOV32I R13, 0x4; /* 0x1800000010035de2 */
/*0120*/ ISETP.LE.U32.AND P0, PT, R13, 0x20, PT; /* 0x198ec00080d1dc03 */
/*0128*/ ISUB R12, 0x20, R13; /* 0x4800c00080d31e03 */
/*0130*/ SHL R11, R11, R13; /* 0x6000000034b2dc03 */
/*0138*/ SHR.U32 R14, R10, R12; /* 0x5800000030a39c03 */
/* 0x22c2804282328047 */
/*0148*/ IADD R11, R11, R14; /* 0x4800000038b2dc03 */
/*0150*/ @!P0 IADD R12, R13, -0x20; /* 0x4800ffff80d32003 */
/*0158*/ @!P0 SHL R11, R10, R12; /* 0x6000000030a2e003 */
/*0160*/ SHL R10, R10, R13; /* 0x6000000034a29c03 */
/*0168*/ MOV R10, R10; /* 0x2800000028029de4 */
/*0170*/ MOV R11, R11; /* 0x280000002c02dde4 */
/*0178*/ IADD R8.CC, R8, R10; /* 0x4801000028821c03 */
/* 0x228042c042828047 */
/*0188*/ IADD.X R9, R9, R11; /* 0x480000002c925c43 */
/*0190*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0198*/ MOV R9, R9; /* 0x2800000024025de4 */
/*01a0*/ LD.E R10, [R8]; /* 0x8400000000829c85 */
/*01a8*/ IADD R12.CC, R8, 0x4; /* 0x4801c00010831c03 */
/*01b0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01b8*/ MOV R12, R12; /* 0x2800000030031de4 */
/* 0x2202828042c2e287 */
/*01c8*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01d0*/ LD.E R11, [R12]; /* 0x8400000000c2dc85 */
/*01d8*/ IADD R12.CC, R8, 0x8; /* 0x4801c00020831c03 */
/*01e0*/ IADD.X R13, R9, RZ; /* 0x48000000fc935c43 */
/*01e8*/ MOV R12, R12; /* 0x2800000030031de4 */
/*01f0*/ MOV R13, R13; /* 0x2800000034035de4 */
/*01f8*/ LD.E R12, [R12]; /* 0x8400000000c31c85 */
/* 0x2282c202828042c7 */
/*0208*/ IADD R8.CC, R8, 0xc; /* 0x4801c00030821c03 */
/*0210*/ IADD.X R9, R9, RZ; /* 0x48000000fc925c43 */
/*0218*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0220*/ MOV R9, R9; /* 0x2800000024025de4 */
/*0228*/ LD.E R8, [R8]; /* 0x8400000000821c85 */
/*0230*/ IADD R14.CC, R2, 0xc; /* 0x4801c00030239c03 */
/*0238*/ IADD.X R15, R3, RZ; /* 0x48000000fc33dc43 */
/* 0x22828042c2e28047 */
/*0248*/ MOV R14, R14; /* 0x2800000038039de4 */
/*0250*/ MOV R15, R15; /* 0x280000003c03dde4 */
/*0258*/ ST.E [R14], R8; /* 0x9400000000e21c85 */
/*0260*/ IADD R8.CC, R2, 0x8; /* 0x4801c00020221c03 */
/*0268*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*0270*/ MOV R8, R8; /* 0x2800000020021de4 */
/*0278*/ MOV R9, R9; /* 0x2800000024025de4 */
/* 0x22c2e2828042c2e7 */
/*0288*/ ST.E [R8], R12; /* 0x9400000000831c85 */
/*0290*/ IADD R8.CC, R2, 0x4; /* 0x4801c00010221c03 */
/*0298*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02a0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02a8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02b0*/ ST.E [R8], R11; /* 0x940000000082dc85 */
/*02b8*/ IADD R8.CC, R2, RZ; /* 0x48010000fc221c03 */
/* 0x22820042e2828047 */
/*02c8*/ IADD.X R9, R3, RZ; /* 0x48000000fc325c43 */
/*02d0*/ MOV R8, R8; /* 0x2800000020021de4 */
/*02d8*/ MOV R9, R9; /* 0x2800000024025de4 */
/*02e0*/ ST.E [R8], R10; /* 0x9400000000829c85 */

Here is the rest:
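
The listing below uses a CUDA_CALL error-checking macro whose definition is not shown in the question. A minimal sketch of such a macro (an assumption; the original may differ) could be:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical definition: abort with a message if the CUDA runtime call fails.
#define CUDA_CALL(call)                                                  \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                    cudaGetErrorString(err_), __FILE__, __LINE__);       \
            exit(EXIT_FAILURE);                                          \
        }                                                                \
    } while (0)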

void CudaTest()
{
    int size = 8192;

    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    float *host_a = (float*)malloc(4 * size * sizeof(float));
    float *host_b = (float*)malloc(4 * size * sizeof(float));
    float *host_c = (float*)malloc(4 * size * sizeof(float));

    float4 *dev_a4 = 0;
    float4 *dev_b4 = 0;
    float4 *dev_c4 = 0;
    float4 *host_a4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_b4 = (float4*)malloc(size * sizeof(float4));
    float4 *host_c4 = (float4*)malloc(size * sizeof(float4));

    for (int i = 0; i < 4 * size; i++)
    {
        host_a[i] = rand() / (float)RAND_MAX;
        host_b[i] = rand() / (float)RAND_MAX;
    }

    for (int i = 0; i < size; i++)
    {
        host_a4[i].x = rand() / (float)RAND_MAX;
        host_a4[i].y = rand() / (float)RAND_MAX;
        host_a4[i].z = rand() / (float)RAND_MAX;
        host_a4[i].w = rand() / (float)RAND_MAX;
        host_b4[i].x = rand() / (float)RAND_MAX;
        host_b4[i].y = rand() / (float)RAND_MAX;
        host_b4[i].z = rand() / (float)RAND_MAX;
        host_b4[i].w = rand() / (float)RAND_MAX;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    CUDA_CALL(cudaSetDevice(0));

    // Allocate GPU buffers for three vectors (two input, one output).
    CUDA_CALL(cudaMalloc((void**)&dev_c, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_a, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_b, 4 * size * sizeof(float)));
    CUDA_CALL(cudaMalloc((void**)&dev_c4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_a4, size * sizeof(float4)));
    CUDA_CALL(cudaMalloc((void**)&dev_b4, size * sizeof(float4)));

    // Copy input vectors from host memory to GPU buffers.
    CUDA_CALL(cudaMemcpy(dev_a, host_a, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b, host_b, 4 * size * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_a4, host_a4, size * sizeof(float4), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(dev_b4, host_b4, size * sizeof(float4), cudaMemcpyHostToDevice));

    int local = 256;
    int N = size / local;
    // Launch a kernel on the GPU with one thread for each element.
    add_float<<<4 * N, local>>>(dev_c, dev_a, dev_b);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());

    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    // Check for any errors launching the kernel
    CUDA_CALL(cudaGetLastError());

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    CUDA_CALL(cudaDeviceSynchronize());

    // Copy output vector from GPU buffer to host memory.
    CUDA_CALL(cudaMemcpy(host_c, dev_c, 4 * size * sizeof(float), cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(host_c4, dev_c4, size * sizeof(float4), cudaMemcpyDeviceToHost));
}
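
The timings in the question came from the profiler. For completeness, the two launches can also be timed directly with CUDA events; the helper below is a sketch that reuses the kernels and the CUDA_CALL macro from above and is not part of the original code.

#include <cstdio>

// Sketch: time one launch of each kernel with CUDA events.
void TimeKernels(float *dev_c, const float *dev_a, const float *dev_b,
                 float4 *dev_c4, const float4 *dev_a4, const float4 *dev_b4,
                 int size, int local)
{
    cudaEvent_t start, stop;
    float ms_float = 0.0f, ms_float4 = 0.0f;
    int N = size / local;

    CUDA_CALL(cudaEventCreate(&start));
    CUDA_CALL(cudaEventCreate(&stop));

    CUDA_CALL(cudaEventRecord(start));
    add_float<<<4 * N, local>>>(dev_c, dev_a, dev_b);
    CUDA_CALL(cudaEventRecord(stop));
    CUDA_CALL(cudaEventSynchronize(stop));
    CUDA_CALL(cudaEventElapsedTime(&ms_float, start, stop));

    CUDA_CALL(cudaEventRecord(start));
    add_float4<<<N, local>>>(dev_c4, dev_a4, dev_b4);
    CUDA_CALL(cudaEventRecord(stop));
    CUDA_CALL(cudaEventSynchronize(stop));
    CUDA_CALL(cudaEventElapsedTime(&ms_float4, start, stop));

    printf("add_float: %.3f ms, add_float4: %.3f ms\n", ms_float, ms_float4);

    CUDA_CALL(cudaEventDestroy(start));
    CUDA_CALL(cudaEventDestroy(stop));
}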

Best answer

Using the vector load/store instructions provided by the GPU hardware is considered a performance optimization applied by the compiler, since code that uses scalar loads and stores is fully functional. When nvcc compiles code with -G (typically used for debugging), all optimizations, including the vectorization of loads and stores, are turned off.

To check for load/store vectorization, it is important to look at the actual machine code being executed (SASS), not at the PTX, which is merely an intermediate representation. The PTX is compiled into SASS by an optimizing compiler component called ptxas, which is invoked by the nvcc compiler driver. Run cuobjdump --dump-sass on the executable produced by nvcc to inspect the machine code.
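
As an illustration of such a vectorized access pattern (a sketch of a common technique, not taken from the question or the answer), the scalar kernel can also be made to issue 128-bit loads and stores explicitly by reinterpreting the float pointers as float4. This assumes the pointers are 16-byte aligned, which holds for cudaMalloc allocations, and that the element count is a multiple of 4.

// Each thread processes four consecutive floats through one float4 load/store.
__global__ void add_float_vec(float *c, const float *a, const float *b)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float4 a4 = reinterpret_cast<const float4 *>(a)[i];
    float4 b4 = reinterpret_cast<const float4 *>(b)[i];
    float4 c4 = make_float4(a4.x + b4.x, a4.y + b4.y, a4.z + b4.z, a4.w + b4.w);
    reinterpret_cast<float4 *>(c)[i] = c4;
}

With an optimized build (without -G), the loads and stores in this kernel and in add_float4 above are expected to show up in the SASS as single 128-bit LD.E.128 / ST.E.128 instructions on this architecture.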

Regarding "c - Float4 is not faster than float in CUDA", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/28095261/
