
c++ - Segmentation fault when using cudaMemcpy


I am trying to use cudaMemcpy to copy the contents of a std::vector (via std::vector::data) into an array used by a device kernel, but it gives a segmentation fault. The call looks like this:

  cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);

where vx is a vector. The complete example is below. Any hints as to where the problem lies would be greatly appreciated.

#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i < n) {
        y[i] = x[i] + y[i];
    }
}


int main(void)
{
    int N = 1<<10;
    float *d_x = NULL, *d_y = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*N);
    cudaMalloc((void **)&d_y, sizeof(float)*N);

    // Allocate Unified Memory – accessible from CPU or GPU
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        vx.push_back(1.0f);
        vy.push_back(2.0f);
    }

    cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), N*sizeof(float), cudaMemcpyHostToDevice);
    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
                     // maximum occupancy for a full device launch
    int gridSize;    // The actual grid size needed, based on input size

    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, N);
    // Round up according to array size
    gridSize = (N + blockSize - 1) / blockSize;

    cout<<"blockSize: "<<blockSize<<" minGridSize: "<<minGridSize<<" gridSize: "<<gridSize<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                      (float)(props.maxThreadsPerMultiProcessor /
                              props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
           blockSize, occupancy);


    // Run kernel on 1M elements on the GPU
    add<<<gridSize, blockSize>>>(N, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;

    for (int i = 0; i < N; i++) {
        maxError = fmax(maxError, fabs(d_y[i]-3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);

    return 0;
}


blockSize: 1024 minGridSize: 16 gridSize: 1
Launched blocks of size 1024. Theoretical occupancy: 1.000000
Segmentation fault (core dumped)

Best Answer

The problem is here:

for (int i = 0; i < N; i++) {
    maxError = fmax(maxError, fabs(d_y[i]-3.0f));
                                   ^^^^^^
}

The reason is that you cannot dereference a device pointer in host code.

The solution is to copy the device memory back to the host, analogous to the host-to-device copy, before reading the results.
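A minimal sketch of that fix, replacing the error-checking section of the question's code (the host buffer hy is a name introduced here for illustration; everything else follows the code above):

    // Copy the result from the device back to the host before reading it,
    // mirroring the earlier host-to-device copies.
    vector<float> hy(N);
    cudaMemcpy(hy.data(), d_y, N*sizeof(float), cudaMemcpyDeviceToHost);

    // Check for errors (all values should be 3.0f) -- read the host copy, not d_y
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        maxError = fmax(maxError, fabs(hy[i] - 3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;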

Regarding "c++ - Segmentation fault when using cudaMemcpy", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/58402191/
