
parallel-processing - Calling a __host__ function from a __global__ function is not allowed when using cudaMallocManaged


I have a piece of pre-written code that I am trying to modify to use CUDA, and I am running into a lot of trouble. At the moment, I am trying to make the functions that I want to become kernel functions void, but I am getting a number of errors.

Here is the list of errors I receive:

black_scholes.cu(54): error: calling a __host__ function("cudaMallocManaged<double> ") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(54): error: identifier "cudaMallocManaged<double> " is undefined in device code

black_scholes.cu(56): error: calling a __host__ function("init_gaussrand_state") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(56): error: identifier "init_gaussrand_state" is undefined in device code

black_scholes.cu(65): error: calling a __host__ function("spawn_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(65): error: identifier "spawn_prng_stream" is undefined in device code

black_scholes.cu(66): error: calling a __host__ function("gaussrand1") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(66): error: identifier "gaussrand1" is undefined in device code

black_scholes.cu(66): error: identifier "uniform_random_double" is undefined in device code

black_scholes.cu(73): error: calling a __host__ function("free_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(73): error: identifier "free_prng_stream" is undefined in device code

black_scholes.cu(74): error: calling a __host__ function("cudaFree") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(74): error: identifier "cudaFree" is undefined in device code

I am posting the first two errors in particular because, when learning CUDA through the NVIDIA introductory class, it was common to call cudaMallocManaged in a __global__ function, and I don't see what is different here.

Here is my .cu code:

#include "black_scholes.h"
#include "gaussian.h"
#include "random.h"
#include "util.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__managed__ double stddev;

__global__ void black_scholes_stddev (void* the_args)
{
    black_scholes_args_t* args = (black_scholes_args_t*) the_args;
    const double mean = args->mean;
    const int M = args->M;
    double variance = 0.0;
    int k = blockIdx.x * blockDim.x + threadIdx.x;

    if (k < M)
    {
        const double diff = args->trials[k] - mean;
        variance += diff * diff / (double) M;
    }

    args->variance = variance;
    stddev = sqrt (variance);
}


__global__ void black_scholes_iterate (void* the_args)
{
    black_scholes_args_t* args = (black_scholes_args_t*) the_args;

    const int S = args->S;
    const int E = args->E;
    const int M = args->M;
    const double r = args->r;
    const double sigma = args->sigma;
    const double T = args->T;

    double* trials = args->trials;
    double mean = 0.0;

    gaussrand_state_t gaussrand_state;
    void* prng_stream = NULL;

    double* randnumbs;
    cudaMallocManaged (&randnumbs, M * sizeof (double));

    init_gaussrand_state (&gaussrand_state);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.x * blockDim.x + threadIdx.x;

    //for (int i = 0; i < M; i++)
    if (i < M)
    {
        prng_stream = spawn_prng_stream (i % 4);
        const double gaussian_random_number = gaussrand1 (&uniform_random_double, prng_stream, &gaussrand_state);
        randnumbs[i] = gaussian_random_number;
        const double current_value = S * exp ((r - (sigma * sigma) / 2.0) * T + sigma * sqrt (T) * randnumbs[k]);
        trials[k] = exp (-r * T) * ((current_value - E < 0.0) ? 0.0 : current_value - E);
        mean += trials[k] / (double) M; // needs to be shared
        args->mean = mean;
    }
    free_prng_stream (prng_stream);
    cudaFree (randnumbs);
}



void black_scholes (confidence_interval_t* interval,
                    const double S,
                    const double E,
                    const double r,
                    const double sigma,
                    const double T,
                    const int M,
                    const int n)
{
    black_scholes_args_t args;
    double mean = 0.0;
    double conf_width = 0.0;
    double* trials = NULL;

    assert (M > 0);
    trials = (double*) malloc (M * sizeof (double));
    assert (trials != NULL);

    args.S = S;
    args.E = E;
    args.r = r;
    args.sigma = sigma;
    args.T = T;
    args.M = M;
    args.trials = trials;
    args.mean = 0.0;
    args.variance = 0.0;

    (void) black_scholes_iterate<<<1,1>>> (&args);
    mean = args.mean;
    black_scholes_stddev<<<1,1>>> (&args);
    cudaDeviceSynchronize ();

    conf_width = 1.96 * stddev / sqrt ((double) M);
    interval->min = mean - conf_width;
    interval->max = mean + conf_width;

    deinit_black_scholes_args (&args);
}


void deinit_black_scholes_args (black_scholes_args_t* args)
{
    if (args != NULL)
        if (args->trials != NULL)
        {
            free (args->trials);
            args->trials = NULL;
        }
}

Any help understanding what is happening would be greatly appreciated, as this seems to be a recurring theme.

Best Answer

It is not currently possible to call cudaMallocManaged in CUDA device code. It simply cannot be done. I am not aware of any NVIDIA training material that demonstrates using cudaMallocManaged in device code.
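For reference, a minimal sketch of the usual pattern: cudaMallocManaged is called from host code, and only the resulting managed pointer is used inside the kernel. The kernel, sizes, and values below are illustrative, not taken from the question's codebase.

#include <cstdio>

__global__ void scale (double* data, int n, double factor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;   // device code only dereferences the managed pointer
}

int main ()
{
    const int N = 1024;
    double* data;
    cudaMallocManaged (&data, N * sizeof (double));   // allocation happens on the host

    for (int i = 0; i < N; i++)
        data[i] = 1.0;

    scale<<<(N + 255) / 256, 256>>> (data, N, 2.0);
    cudaDeviceSynchronize ();   // wait before touching managed memory on the host again

    printf ("data[0] = %f\n", data[0]);
    cudaFree (data);            // freeing also happens on the host
    return 0;
}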

If you wish to perform allocations in the kernel, I suggest using the methods described in the programming guide. In addition, in-kernel new and delete work similarly to in-kernel malloc() and free(), as sketched below.
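A minimal sketch of in-kernel allocation along those lines: device-side malloc()/free() draw from the device heap rather than managed memory. The heap-size value and the per-thread element count here are illustrative assumptions.

#include <cstdio>

__global__ void per_thread_alloc (int elems_per_thread)
{
    // Device-side malloc() allocates from the device heap, not managed memory.
    double* scratch = (double*) malloc (elems_per_thread * sizeof (double));
    if (scratch == NULL)
        return;   // device malloc can fail, so check the result

    for (int i = 0; i < elems_per_thread; i++)
        scratch[i] = (double) i;

    free (scratch);   // must be freed in device code as well
}

int main ()
{
    // Enlarge the device heap before launch if the default (8 MB) is too small.
    cudaDeviceSetLimit (cudaLimitMallocHeapSize, 32 * 1024 * 1024);
    per_thread_alloc<<<4, 64>>> (16);
    cudaDeviceSynchronize ();
    return 0;
}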

Regarding parallel-processing - Calling a __host__ function from a __global__ function is not allowed when using cudaMallocManaged, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/67521866/
