gpt4 book ai didi

c++ - 运行使用统一内存的内核函数后,主机端无法再访问该统一内存

转载 作者:搜寻专家 更新时间:2023-10-31 01:32:15 25 4
gpt4 key购买 nike

我在代码中为两个内核函数使用了由 cudaMallocManaged 分配的统一内存。第一个函数 backwardMask() 工作正常:调用它之后,我可以在主机端顺利访问数据。问题出在内核函数 seriesLengths() 上:我用 cudaMallocManaged 分配了索引掩码 indexmask,在调用 seriesLengths() 之前,我可以在主机上正常访问/修改它;内核运行期间,它也能修改并访问 indexmask,没有任何问题。但是在该函数返回之后,我就无法在主机上读取 indexmask 了,并且会出现异常(状态代码 0xC0000022)。

这是一个非常奇怪的错误,因为我正在类比第一个函数(backwardMask()),它可以正常工作。

任何想法/解释将不胜感激。

这是 seriesLengths 内核函数代码:

// Derives the start index of every series from a per-element series label.
//
// scannedbw   : n labels; an element starts a new series when its label differs
//               from the previous element's label. In this file the labels are
//               the inclusive scan of the backwardMask output, so they begin at 1.
// indexmask   : output; slot (label - 1) receives the index where that series
//               starts, slot 0 is set to 0, and slot scannedbw[n - 1] receives n
//               as an end marker. Must hold at least scannedbw[n - 1] + 1 ints.
// numOfSeries : output; receives scannedbw[n - 1].
// n           : number of elements in scannedbw.
//
// All three pointers are dereferenced on the device, so each must refer to
// device-accessible memory (for example a cudaMallocManaged allocation).
// Works with any 1D launch configuration because of the grid-stride loop.
__global__ void seriesLengths(int* scannedbw,int* indexmask,int* numOfSeries,int n){
    const int step = blockDim.x * gridDim.x;
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    while (pos < n)
    {
        // The thread that owns the final element publishes the series count
        // and writes the end-of-data marker.
        if (pos == n - 1)
        {
            const int lastLabel = scannedbw[pos];
            *numOfSeries = lastLabel;
            indexmask[lastLabel] = n;
        }

        if (pos != 0)
        {
            // A change of label relative to the left neighbour marks a series start.
            const int label = scannedbw[pos];
            if (label != scannedbw[pos - 1])
            {
                indexmask[label - 1] = pos;
            }
        }
        else
        {
            // The first series always starts at index 0.
            indexmask[0] = 0;
        }

        pos += step;
    }
}

内核函数 backwardMask 的代码:

// Marks the positions where the input changes value.
//
// in     : n input characters.
// bwMask : output of n ints; bwMask[0] is always 1, and for i > 0 bwMask[i] is 1
//          when in[i] differs from in[i - 1] and 0 otherwise.
// n      : number of elements in both arrays.
//
// Both pointers are dereferenced on the device. Works with any 1D launch
// configuration because of the grid-stride loop.
__global__ void backwardMask(const char *in, int* bwMask,int n)
{
    const int step = blockDim.x * gridDim.x;
    int pos = blockIdx.x * blockDim.x + threadIdx.x;

    while (pos < n)
    {
        // Short-circuit keeps element 0 from reading in[-1].
        const bool startsSeries = (pos == 0) || (in[pos] != in[pos - 1]);
        bwMask[pos] = startsSeries ? 1 : 0;
        pos += step;
    }
}

主函数:

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Demo driver: fills a random character buffer, marks where the value changes
// (backwardMask), prefix-sums the marks, and derives the start index of every
// series (seriesLengths). Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    const int N = 1024;
    srand(time(0));

    char* t = 0;
    int* bwmask = 0;
    if (!cudaOk(cudaMallocManaged(&t, N * sizeof(char)), "cudaMallocManaged(t)"))
        return 1;
    if (!cudaOk(cudaMallocManaged(&bwmask, N * sizeof(int)), "cudaMallocManaged(bwmask)"))
        return 1;

    // The first 300 characters come from a 2-letter alphabet, the rest from a
    // 20-letter one.
    for (int i = 0; i < N; i++)
    {
        if (i < 300)
            t[i] = 'a' + rand() % 2;
        else
            t[i] = 'a' + rand() % 20;
    }

    for (int j = 0; j < 60; j++)
        std::cout << t[j];
    std::cout << std::endl;

    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;

    backwardMask<<<numBlocks, blockSize>>>(t, bwmask, N);
    if (!cudaOk(cudaGetLastError(), "backwardMask launch"))
        return 1;
    // Managed memory must not be touched by the host until the kernel is done.
    if (!cudaOk(cudaDeviceSynchronize(), "backwardMask execution"))
        return 1;

    for (int j = 0; j < 60; j++)
        std::cout << bwmask[j];
    std::cout << std::endl;

    // now inclusive prefix sum for bwmask
    int* scannedbwmask = 0;
    if (!cudaOk(cudaMallocManaged(&scannedbwmask, N * sizeof(int)), "cudaMallocManaged(scannedbwmask)"))
        return 1;

    thrust::inclusive_scan(bwmask, bwmask + N, scannedbwmask);
    // Kept from the original; it also surfaces any pending asynchronous error.
    if (!cudaOk(cudaDeviceSynchronize(), "synchronize after inclusive_scan"))
        return 1;

    // The kernel writes the series count through this pointer, so it has to be
    // device-accessible. The original passed the address of a host stack
    // variable (&numOfSeries); the device cannot write there, the kernel
    // faulted, and every later host access to managed memory failed.
    int* numOfSeries = 0;
    if (!cudaOk(cudaMallocManaged(&numOfSeries, sizeof(int)), "cudaMallocManaged(numOfSeries)"))
        return 1;
    *numOfSeries = 0;

    // seriesLengths shows us lengths of each series by i-(i-1) and starting index of each series
    int* indexmask = 0;
    if (!cudaOk(cudaMallocManaged(&indexmask, (N + 1) * sizeof(int)), "cudaMallocManaged(indexmask)"))
        return 1;

    seriesLengths<<<numBlocks, blockSize>>>(scannedbwmask, indexmask, numOfSeries, N);
    if (!cudaOk(cudaGetLastError(), "seriesLengths launch"))
        return 1;
    if (!cudaOk(cudaDeviceSynchronize(), "seriesLengths execution"))
        return 1;

    std::cout << indexmask[3];
    /*for (int j = 0; j < 60; j++)
    std::cout << indexmask[j];
    std::cout << std::endl;*/
    std::cout << "numseries " << *numOfSeries;

    // Release the managed allocations before exiting.
    cudaFree(indexmask);
    cudaFree(numOfSeries);
    cudaFree(scannedbwmask);
    cudaFree(bwmask);
    cudaFree(t);

    // Wait for a key press so the console window stays open.
    getch();
    return 0;
}

最佳答案

原代码把主机栈上变量的地址 &numOfSeries 传给了内核,而设备无法访问这个地址,内核向它写入时会出错,此后主机端也就无法再访问统一内存。解决办法是将 numOfSeries 更改为指向 int 的指针:

int* numOfSeries;

然后为它分配内存:

cudaMallocManaged(&numOfSeries, sizeof(int));

然后这样传递(内核执行完并同步之后,在主机端通过 *numOfSeries 读取结果):

seriesLengths<<<numBlocks, blockSize>>>(scannedbwmask, indexmask, numOfSeries, N);

关于c++ - 使用它运行内核函数后无法访问统一内存,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/43679680/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com