gpt4 book ai didi

nvidia - Cuda-memcheck 不报告越界共享内存访问

转载 作者:行者123 更新时间:2023-12-04 06:04:00 25 4
gpt4 key购买 nike

我正在使用共享内存运行以下代码:

    __global__ void computeAddShared(int *in , int *out, int sizeInput){
//not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code
extern __shared__ float temp[];

int tid = blockIdx.x * blockDim.x + threadIdx.x;
int ltid = threadIdx.x;
temp[ltid] = 0;
while(tid < sizeInput){
temp[ltid] += in[tid];
tid+=gridDim.x * blockDim.x; // to handle array of any size
}
__syncthreads();
int offset = 1;
while(offset < blockDim.x){
if(ltid % (offset * 2) == 0){
temp[ltid] = temp[ltid] + temp[ltid + offset];
}
__syncthreads();
offset*=2;
}
if(ltid == 0){
out[blockIdx.x] = temp[0];
}

}

int main(){

int size = 16; // size of present input array. Changes after every loop iteration
int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
/*FILE *f;
f = fopen("invertedList.txt" , "w");
a[0] = 1 + (rand() % 8);
fprintf(f, "%d,",a[0]);
for( int i = 1 ; i< N; i++){
a[i] = a[i-1] + (rand() % 8) + 1;
fprintf(f, "%d,",a[i]);
}
fclose(f);*/
int* gidata;
int* godata;
cudaMalloc((void**)&gidata, size* sizeof(int));
cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice);
int TPB = 4;
int blocks = 10; //to get things kicked off
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
while(blocks != 1 ){
if(size < TPB){
TPB = size; // size is 2^sth
}
blocks = (size+ TPB -1 ) / TPB;
cudaMalloc((void**)&godata, blocks * sizeof(int));
computeAddShared<<<blocks, TPB,TPB>>>(gidata, godata,size);
cudaFree(gidata);
gidata = godata;
size = blocks;
}
//printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError()));


cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime , start, stop);
printf("time is %f ms", elapsedTime);
int *output = (int*)malloc(sizeof(int));
cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost);
//Cant free either earlier as both point to same location
cudaError_t chk = cudaFree(godata);
if(chk!=0){
printf("First chk also printed error. Maybe error in my logic\n");
}

printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
printf("The sum of the array is %d\n", output[0]);
getchar();

return 0;
}

显然,computeAddShared 中的第一个 while 循环导致了越界错误,因为我将 4 个字节分配给共享内存。为什么 cudamemcheck 没有捕捉到这个。下面是 cuda-memcheck 的输出
========= CUDA-MEMCHECK
time is 12.334816 msThe error by threadsyn is no errorThe sum of the array is 13
6

========= ERROR SUMMARY: 0 errors

最佳答案

共享内存分配粒度。硬件无疑具有用于分配的页面大小(可能与 L1 缓存行侧相同)。如果每个块只有 4 个线程,那么单个页面中将“意外地”有足够的共享内存来让您进行代码工作。如果您使用了合理数量的线程块(即扭曲大小的整数倍),则会检测到错误,因为分配的内存不足。

关于nvidia - Cuda-memcheck 不报告越界共享内存访问,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/8580398/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com