gpt4 book ai didi

c++ - 在 CUDA 内核中传递常量整数

转载 作者:行者123 更新时间:2023-11-28 05:05:46 24 4
gpt4 key购买 nike

<分区>

我在使用以下代码时遇到问题。在全局内核 loop_d 中,M 的整数值为 84。当我尝试创建共享数组 temp 并使用 M 作为数组的大小时,出现以下错误:

错误:表达式必须有一个常量值

我不确定为什么会这样。我知道如果我将 M 声明为全局变量,那么它就可以工作,但问题是我通过在不同的 Fortran 程序中调用函数 d_two 来获取 M 的值,所以我不确定如何解决这个问题。我知道如果我用 temp[84] 替换 temp[M],那么我的程序运行完美,但这不是很实用,因为不同的问题可能有不同的 M 值。谢谢你的帮助!

程序

// Parallelized 2D Three-Point Guassian Quadrature Numerical Integration Method
// The following program is part of two linked programs, Integral_2D_Cuda.f.
// This is a CUDA kernel that could be called in the Integral_2D_Cuda.f Fortran code to compute
// the integral of a given 2D-function
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
// The following is a definition for the atomicAddd function that is called in the loop_d kernel
// This is needed because the "regular" atomicAdd function only works for floats and integers
__device__ double atomicAddd(double* address, double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
// GPU kernel that computes the function of interest. This is good for a two dimensional problem.
__global__ void loop_d(double *a_sx, double *b_swx, double *c_sy, double *d_swy, double *e_ans0, int N, int M)
{
// Declaring a shared array that threads of the same block have access to
__shared__ double temp[M];
int idxX = blockIdx.x * blockDim.x + threadIdx.x; // Thread indices responsible for the swx and sx arrays
int idxY = threadIdx.y; // Thread indices responsible for the swy and sy arrays
// Computing the multiplication of elements
if (idxX < N && idxY < M)
{
temp[idxY] = a_sx[idxX] * b_swx[idxX] * c_sy[idxY] * d_swy[idxY];
}
// synchronizing all threads before summing all the mupltiplied elements int he temp array
__syncthreads();
// Allowing the 0th thread of y to do the summation of the multiplied elements in the temp array of one block
if (0 == idxY)
{
double sum = 0.00;
for(int k = 0; k < M; k++)
{
sum = sum + temp[k];
}
// Adding the result of this instance of calculation to the final answer, ans0
atomicAddd(e_ans0, sum);
}
}
extern "C" void d_two_(double *sx, double *swx, int *nptx, double *sy, double *swy, int *npty, double *ans0)
{
// Assigning GPU pointers
double *sx_d, *swx_d;
int N = *nptx;
double *sy_d, *swy_d;
int M = *npty;
double *ans0_d;
dim3 threadsPerBlock(1,M); // Creating a two dimesional block with 1 thread in the x dimesion and M threads in the y dimesion
dim3 numBlocks(N); // specifying the number of blocks to use of dimesion 1xM
// Allocating GPU Memory
cudaMalloc( (void **)&sx_d, sizeof(double) * N);
cudaMalloc( (void **)&swx_d, sizeof(double) * N);
cudaMalloc( (void **)&sy_d, sizeof(double) * M);
cudaMalloc( (void **)&swy_d, sizeof(double) * M);
cudaMalloc( (void **)&ans0_d, sizeof(double) );
// Copying information fromm CPU to GPU
cudaMemcpy( sx_d, sx, sizeof(double) * N, cudaMemcpyHostToDevice );
cudaMemcpy( swx_d, swx, sizeof(double) * N, cudaMemcpyHostToDevice );
cudaMemcpy( sy_d, sy, sizeof(double) * M, cudaMemcpyHostToDevice );
cudaMemcpy( swy_d, swy, sizeof(double) * M, cudaMemcpyHostToDevice );
cudaMemcpy( ans0_d, ans0, sizeof(double), cudaMemcpyHostToDevice );
// Calling the function on the GPU
loop_d<<< numBlocks, threadsPerBlock >>>(sx_d, swx_d, sy_d, swy_d, ans0_d, N, M);
// Copying from GPU to CPU
cudaMemcpy( ans0, ans0_d, sizeof(double), cudaMemcpyDeviceToHost );
// freeing GPU memory
cudaFree(sx_d);
cudaFree(swx_d);
cudaFree(sy_d);
cudaFree(swy_d);
cudaFree(ans0_d);
return;
}

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com