c++ - __device__ 类成员函数更改设备变量的值后从设备复制到主机时出现 cudaMemcpy 错误-6ren

c++ - device 类成员函数更改设备变量的值后从设备复制到主机时出现 cudaMemcpy 错误

转载作者：行者123 更新时间：2023-11-30 02:26:27

我对我编写的 CUDA 代码的行为感到困惑。我正在为名为 DimmedGridGPU 的类中的 __device__ 函数编写测试。此类以 int DIM 为模板，我遇到的函数是返回距离输入值 x 最近的网格值。我有这个内核命名空间用于单元测试目的，以隔离调用每个 __device__ 函数。

此代码的预期行为是从 do_get_value(x, grid_) 调用返回值 3.0，并设置 d_target[0] 到此值，然后将其传回主机端以进行单元测试断言。整个内核似乎运行正常，但是当我将最终传输回主机端时，我收到一个 cudaErrorInvalidValue 错误，我不明白为什么。

这是代码的一个最小示例，保留了类的结构及其特性:

#include <cuda_runtime.h>
#include <fstream>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
     fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}


template <int DIM>
class DimmedGridGPU{

public:
  size_t grid_size_;//total size of grid
  int b_derivatives_;//if derivatives are going to be used
  int b_interpolate_;//if interpolation should be used on the grid
  double* grid_;//the grid values
  double* grid_deriv_;//derivatives    
  double dx_[DIM];//grid spacing
  double min_[DIM];//grid minimum
  double max_[DIM];//maximum
  int grid_number_[DIM];//number of points on grid
  int b_periodic_[DIM];//if a dimension is periodic
  int* d_b_interpolate_;
  int* d_b_derivatives_;


  DimmedGridGPU(const double* min, 
        const double* max, 
        const double* bin_spacing, 
        const int* b_periodic, 
        int b_derivatives, 
        int b_interpolate) :   b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){
    
    size_t i;

    for(i = 0; i < DIM; i++) {
      min_[i] = min[i];
      max_[i] = max[i];
      b_periodic_[i] = b_periodic[i];

      grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
      dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
      //add one to grid points if 
      grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
      //increment dx to compensate
      if(!b_periodic_[i])
    max_[i] += dx_[i];
    }

    grid_size_ = 1;
    for(i = 0; i < DIM; i++)
      grid_size_ *= grid_number_[i];
    gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
    if(b_derivatives_) {
      gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
      if(!grid_deriv_) {
    printf("Out of memory!! gpugrid.cuh:initialize");   
      }
    }
    
    gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
  }

  ~DimmedGridGPU(){
    gpuErrchk(cudaDeviceSynchronize());
    if(grid_ != NULL){
      gpuErrchk(cudaFree(grid_));
      grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
    }
    
    if(grid_deriv_ != NULL){
      gpuErrchk(cudaFree(grid_deriv_));
      grid_deriv_ = NULL;
    }
      
    gpuErrchk(cudaDeviceReset());
  }
//gets the value of the grid closest to x
  __host__ __device__ double do_get_value( double* x, double* grid_) {

    size_t index[DIM];
    get_index(x, index);
    printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
    printf("but multi2one(index) gives us %d\n", multi2one(index));
    double value = grid_[multi2one(index)];
    printf("and value to be returned is %f\n", value);
    return value;
  }
//gets grid's 1D index from an array of coordinates
   __host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
    size_t i;
    double xi;
    printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
    for(i = 0; i < DIM; i++) {
      xi = x[i];
      printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
      if(b_periodic_[i]){
    xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
      }
      result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
    }
  }
//takes a multidimensional index to a 1D index
  __host__ __device__ size_t multi2one(const size_t index[DIM]) const {
    size_t result = index[DIM-1];

    size_t i;    
    for(i = DIM - 1; i > 0; i--) {
      result = result * grid_number_[i-1] + index[i-1];
    }
    
    return result;
    
  }

};

__host__ __device__ int gpu_int_floor(double number) {
  return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}


namespace kernels{
  template <int DIM>
  __global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM>  g){
    target_arr[0] = g.do_get_value(x, grid_);
    printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
    return;
  }
}


int main(){
  using namespace kernels;
  double min[] = {0};
  double max[] = {10};
  double bin_spacing[] = {1};
  int periodic[] = {0};
  DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
  for(int i = 0; i < 11; i++){
    g.grid_[i] = i;
    printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
  }
  gpuErrchk(cudaDeviceSynchronize());
  double x[] = {3.5};
  
  double* d_x;
  gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
  double target[] = {5.0};
  double* d_target;
  gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
  gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
  gpuErrchk(cudaDeviceSynchronize());
  get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
  gpuErrchk(cudaDeviceSynchronize());
  gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
  printf("and after GPU stuff, target[0] is now %f\n", target[0]);
  return(0);
}

那么，为什么这一行(最后一个 cudaMemcpy)抛出错误“CudaErrorInvalidValue”，而我包含的打印语句清楚地表明正在输入正确的值在设备上使用，do_get_value(x, grid_) 调用返回的值是否正确？

我已经尝试过使用 cudaMemcpyFromSymbol，认为赋值可能是创建一个符号而不是以某种方式传递和更改值，但事实并非如此，因为 d_target 不是有效符号。

这是我的代码的示例输出:

g.grid_[0] is now 0.000000

g.grid_[1] is now 1.000000

g.grid_[2] is now 2.000000

g.grid_[3] is now 3.000000

g.grid_[4] is now 4.000000

g.grid_[5] is now 5.000000

g.grid_[6] is now 6.000000

g.grid_[7] is now 7.000000

g.grid_[8] is now 8.000000

g.grid_[9] is now 9.000000

g.grid_[10] is now 10.000000

get_index was called on the GPU in 1 dimension(s)

xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000

do_get_value was called on the GPU!, and index[0] is now 3

but multi2one(index) gives us 3

and value to be returned is 3.000000

get_value_kernel has set target[0] to be 3.000000

GPUassert: "cudaErrorInvalidValue": invalid argument gpugrid.cu 166

最佳答案

So, why does this line (the last cudaMemcpy) throw an error "CudaErrorInvalidValue"...?

问题围绕着你的析构函数:

  ~DimmedGridGPU(){

析构函数在您可能意想不到的地方被调用。要让自己相信这一点，请向析构函数添加 printf 语句。注意它在打印输出中出现的位置:

$ ./t955
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
Destructor!
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument t955.cu 167

鉴于此，很明显在该析构函数中调用 cudaDeviceReset() 现在似乎是个坏主意。 cudaDeviceReset() 会清除所有设备分配，因此当您尝试执行此操作时:

gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));

d_target 不再是设备上的有效分配，因此当您尝试将其用作 cudaMemcpy 的设备目标时，运行时会检查此指针值(未被设备重置更改)并确定指针值不再对应于有效分配，并引发运行时错误。

Just like in C++当您将对象作为按值传递参数传递给函数(在本例中为内核)时，将调用该对象的复制构造函数。当对象拷贝超出范围时，这是理所当然的，the destructor for it will be called .

我建议将诸如 cudaDeviceReset() 之类的影响全局范围的函数放在对象析构函数中可能是一种脆弱的编程范例，但这可能是一个见仁见智的问题。我假设您现在有足够的信息来解决问题。

为了避免下一个可能的问题，简单地注释掉析构函数中对 cudaDeviceReset() 的调用可能不足以使所有问题消失(尽管这个特定的问题会)。既然您知道在该程序的正常执行过程中至少两次调用了此析构函数，您可能需要仔细考虑该析构函数中还发生了什么，并可能剥离更多内容它，或者完全重新构建你的类。

例如，请注意 cudaDeviceReset() 并不是唯一可能在以这种方式使用的对象的析构函数中引起问题的函数。类似地，cudaFree() 在调用对象拷贝的析构函数中使用时，可能会对原始对象产生意想不到的后果。

关于c++ - __device__ 类成员函数更改设备变量的值后从设备复制到主机时出现 cudaMemcpy 错误，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/42844997/

文章推荐： java - 如何根据日志级别将tinylog日志写入不同的文件？

文章推荐： android - Admob 横幅广告无法在 Android 上正常运行

文章推荐： C++ 我需要为嵌套类定义 header 保护吗？

详解C语言sscanf()函数、vsscanf()函数、vscanf()函数
C语言sscanf()函数：从字符串中读取指定格式的数据头文件： ?
php - 如何解释at()函数； substr()函数;伪代码中的exist()函数
最近，我有一个关于工作预评估的问题，即使查询了每个功能的工作原理，我也不知道如何解决。这是一个伪代码。下面是一个名为foo()的函数，该函数将被传递一个值并返回一个值。如果将以下值传递给foo函数，
VBS教程：函数-CStr 函数
CStr 函数返回表达式，该表达式已被转换为 String 子类型的 Variant。 CStr(expression) expression 参数是任意有效的表达式。说明通常，可以
VBS教程：函数-CSng 函数
CSng 函数返回表达式，该表达式已被转换为 Single 子类型的 Variant。 CSng(expression) expression 参数是任意有效的表达式。说明通常，可
VBS教程：函数-CreateObject 函数
CreateObject 函数创建并返回对 Automation 对象的引用。 CreateObject(servername.typename [, location]) 参数 serv
VBS教程：函数-Cos 函数
Cos 函数返回某个角的余弦值。 Cos(number) number 参数可以是任何将某个角表示为弧度的有效数值表达式。说明 Cos 函数取某个角并返回直角三角形两边的比值。此比值是
VBS教程：函数-CLng 函数
CLng 函数返回表达式，此表达式已被转换为 Long 子类型的 Variant。 CLng(expression) expression 参数是任意有效的表达式。说明通常，您可以使
VBS教程：函数-CInt 函数
CInt 函数返回表达式，此表达式已被转换为 Integer 子类型的 Variant。 CInt(expression) expression 参数是任意有效的表达式。说明通常，可
VBS教程：函数-Chr 函数
Chr 函数返回与指定的 ANSI 字符代码相对应的字符。 Chr(charcode) charcode 参数是可以标识字符的数字。说明从 0 到 31 的数字表示标准的不可打印的
VBS教程：函数-CDbl 函数
CDbl 函数返回表达式，此表达式已被转换为 Double 子类型的 Variant。 CDbl(expression) expression 参数是任意有效的表达式。说明通常，您可
VBS教程：函数-CDate 函数
CDate 函数返回表达式，此表达式已被转换为 Date 子类型的 Variant。 CDate(date) date 参数是任意有效的日期表达式。说明 IsDate 函数用于判断 d
VBS教程：函数-CCur 函数
CCur 函数返回表达式，此表达式已被转换为 Currency 子类型的 Variant。 CCur(expression) expression 参数是任意有效的表达式。说明通常，
VBS教程：函数-CByte 函数
CByte 函数返回表达式，此表达式已被转换为 Byte 子类型的 Variant。 CByte(expression) expression 参数是任意有效的表达式。说明通常，可以
VBS教程：函数-CBool 函数
CBool 函数返回表达式，此表达式已转换为 Boolean 子类型的 Variant。 CBool(expression) expression 是任意有效的表达式。说明如果 ex
VBS教程：函数-Atn 函数
Atn 函数返回数值的反正切值。 Atn(number) number 参数可以是任意有效的数值表达式。说明 Atn 函数计算直角三角形两个边的比值 (number) 并返回对应角的弧
VBS教程：函数-Asc 函数
Asc 函数返回与字符串的第一个字母对应的 ANSI 字符代码。 Asc(string) string 参数是任意有效的字符串表达式。如果 string 参数未包含字符，则将发生运行时错误。
VBS教程：函数-Array 函数
Array 函数返回包含数组的 Variant。 Array(arglist) arglist 参数是赋给包含在 Variant 中的数组元素的值的列表（用逗号分隔）。如果没有指定此参数，则
VBS教程：函数-Abs 函数
Abs 函数返回数字的绝对值。 Abs(number) number 参数可以是任意有效的数值表达式。如果 number 包含 Null，则返回 Null；如果是未初始化变量，则返回 0。
VBS教程：函数-FormatPercent 函数
FormatPercent 函数返回表达式，此表达式已被格式化为尾随有 % 符号的百分比（乘以 100 ）。 FormatPercent(expression[,NumDigitsAfterD
VBS教程：函数-FormatNumber 函数
FormatNumber 函数返回表达式，此表达式已被格式化为数值。 FormatNumber( expression [,NumDigitsAfterDecimal [,Inc

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c++ - device 类成员函数更改设备变量的值后从设备复制到主机时出现 cudaMemcpy 错误