gpt4 book ai didi

cuda - __ldg() 内在执行和正常执行之间有什么区别?

转载 作者:行者123 更新时间:2023-12-04 02:18:29 28 4
gpt4 key购买 nike

我正在尝试探索“__ldg 内在”。我已经阅读了 NVIDIA 的文档,但在其使用和实现方面没有得到任何令人满意的答案。此外引用THIS我尝试在一个简单的 1024*1024 矩阵乘法示例中实现 __ldg。

#include<stdio.h>
#include<stdlib.h>

__global__ void matrix_mul(float * ad,float * bd,float * cd,int N)
{
float pvalue=0;
//find Row and Column corresponding to a data element for each thread
int Row = blockIdx.y * blockDim.y + threadIdx.y;
int Col = blockIdx.x * blockDim.x + threadIdx.x;
//calculate dot product of Row of First Matrix and Column of Second Matrix
for(int i=0;i< N;++i)
{
// I tried with executing this first:
float m=__ldg(&ad[Row * N+i]);
float n=__ldg(&bd[i * N + Col]);

//Then I executed this as a normal execution:
// float m = ad[Row * N+i];
// float n = bd[i * N + Col];

pvalue += m * n;
}
//store dot product at corresponding position in resultant Matrix
cd[Row * N + Col] = pvalue;
}

int main()
{
int N = 1024,i,j; //N == size of square matrix

float *a,*b;
float *ad,*bd,*cd,*c;

//open a file for outputting the result
FILE *f;
f=fopen("Parallel Multiply_ldg.txt","w");

size_t size=sizeof(float)* N * N;

//allocate host side memory
a=(float*)malloc(size);
b=(float*)malloc(size);
c=(float*)malloc(size);

for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
a[i*N+j]=2.0; //(float)(i*N+j); //initializing each value with its own index
b[i*N+j]=1.0; //(float)(i*N+j); //random functions can be used alternatively
}
}

//allocate device memory
cudaMalloc(&ad,size);
//printf("\nAfter cudaMalloc for ad\n%s\n",cudaGetErrorString(cudaGetLastError()));
cudaMalloc(&bd,size);
//printf("\nAfter cudaMalloc bd\n%s\n",cudaGetErrorString(cudaGetLastError()));
cudaMalloc(&cd,size);
//printf("\nAfter cudaMalloc cd\n%s\n",cudaGetErrorString(cudaGetLastError()));

//copy value from host to device
cudaMemcpy(ad,a,size,cudaMemcpyHostToDevice);
cudaMemcpy(bd,b,size,cudaMemcpyHostToDevice);

printf("\nAfter HostToDevice Memcpy\n%s\n",cudaGetErrorString(cudaGetLastError()));

//calculate execution configuration
dim3 blocksize(16,16); //each block contains 16 * 16 (=256) threads
dim3 gridsize(N/16,N/16); //creating just sufficient no of blocks

//GPU timer code
float time;
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);

matrix_mul <<< gridsize, blocksize >>> (ad,bd,cd, N);
cudaDeviceSynchronize();
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time,start,stop); //time taken in kernel call calculated
cudaEventDestroy(start);
cudaEventDestroy(stop);

//copy back results
cudaMemcpy(c,cd,sizeof(float)* N*N,cudaMemcpyDeviceToHost);

printf("\nAfter DeviceToHost Memcpy\n%s\n",cudaGetErrorString(cudaGetLastError()));

//output results in output_file
fprintf(f,"Array A was---\n");
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
fprintf(f,"%f ",a[i*N+j]);
fprintf(f,"\n");
}
fprintf(f,"\nArray B was---\n");
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
fprintf(f,"%f ",b[i*N+j]);
fprintf(f,"\n");
}
fprintf(f,"\nMultiplication of A and B gives C----\n");
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
fprintf(f,"%f ",c[i*N+j]); //if correctly computed, then all values must be N
fprintf(f,"\n");
}
printf("\nYou can see output in Parallel Mutiply.txt file in project directory");
printf("\n\nTime taken is %f (ms)\n",time);
fprintf(f,"\n\nTime taken is %f (ms)\n",time);
fclose(f);

cudaThreadExit();
//cudaFree(ad); cudaFree(bd); cudaFree (cd);
free(a);free(b);free(c);
//_getch();
return 1;
}

我评论了内核中的 __ldg 部分并通过正常执行执行,反之亦然。
在这两种情况下,它都会给我正确的乘法结果。我对这些执行之间的时差感到困惑,因为它几乎超过 100 倍!

在 __ldg 的情况下,它给了我: Time taken is 0.014432 (ms)
如果没有 __ldg 正常执行,它会给我: Time taken is 36.858398 (ms)
这是使用 __ldg intrisic 的确切方法吗? __ldg 内在的意义是什么,使用它的正确方法是什么?显然,我在上面的代码中所做的事情是错误且幼稚的。我正在寻找解释和示例。提前致谢。

最佳答案

来自 CUDA C Programming Guide

Global memory accesses for devices of compute capability 3.x are cached in L2 and for devices of compute capability 3.5, may also be cached in the read-only data cache described in the previous section; they are not cached in L1.



...

Data that is read-only for the entire lifetime of the kernel can also be cached in the read-only data cache described in the previous section by reading it using the __ldg() function (see Read-Only Data Cache Load Function). When the compiler detects that the read-only condition is satisfied for some data, it will use __ldg() to read it. The compiler might not always be able to detect that the read-only condition is satisfied for some data. Marking pointers used for loading such data with both the const and __restrict__ qualifiers increases the likelihood that the compiler will detect the read-only condition.



只读缓存访问的延迟比全局内存访问低得多。因为矩阵乘法多次访问内存中的相同值,所以在只读缓存中进行缓存会带来巨大的加速(在内存受限的应用程序中)。

关于cuda - __ldg() 内在执行和正常执行之间有什么区别?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26603188/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com