gpt4 book ai didi

c - 在 CUDA C 中取消引用指针

转载 作者:太空宇宙 更新时间:2023-11-04 07:22:20 25 4
gpt4 key购买 nike

我是一名 C 程序员新手,对这个段错误有点困惑。我以前使用过指针,这没有意义。这段代码是在 NVIDIA GPU 上完成的,但我还没有使用任何 CUDA API 函数(将它们注释掉以隔离错误)。

在函数校准中取消引用 GPU 上的指针 *mu(参见下面的代码)时出现错误。也就是说,该错误是一个段错误。

我的主机代码是:

/******************************************************************************
*cr
*cr
******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "kernel.cu"
#include "support.h"

int main (int argc, char *argv[])
{

Timer timer;
cudaError_t cuda_ret;

// Initialize host variables ----------------------------------------------

printf("\nSetting up the problem...\n"); fflush(stdout);
startTime(&timer);

double* A_h, *T_h, *Delta_h, *E_h, *p_h, *p2_h, *D_h, *Times_h, *ones_h;
double* A_d, *T_d, *Delta_d, *E_d, *p_d, *p2_d, *D_d, *Times_d, *ones_d, *temp_1, *temp_2;
double* mu_h, *alpha_h, *omega_h;
double* mu_d, *alpha_d, *omega_d;
int N;
unsigned int mat_size, vec_size;

// Import data
FILE *fp;
char str[60];
unsigned int count=0;
double d;

/* opening file for reading */
fp = fopen("AAPL_data.txt","r");

if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
++count;

// Stick with a limited subset of the data for now
N = 2000;

fclose(fp);
printf("Count is %u \n",count);

mat_size = N*N;
vec_size = N;

dim3 dim_grid, dim_block;

// Fill matrices with 0's
A_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { A_h[i] = 0; }

T_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { T_h[i] = 0; }

Delta_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { Delta_h[i] = 0; }

E_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { E_h[i] = 0; }

p_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { p_h[i] = 0; }

// Fill vectors with 0's, except the 1's vector
p2_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { p2_h[i] = 0; }

Times_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { Times_h[i] = 0; }

D_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { D_h[i] = 0; }

ones_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { ones_h[i] = 0; }

// Start constants as zero
mu_h = (double*) malloc( sizeof(double));
alpha_h = (double*) malloc( sizeof(double));
omega_h = (double*) malloc( sizeof(double));
*mu_h = 0;
*alpha_h = 0;
*omega_h = 0;

// Import data
count=0;

/* opening file for reading */
fp = fopen("AAPL_data.txt","r");

if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
{
sscanf(str, "%lf", &d);
if(count < vec_size)
Times_h[count] = d;
++count;
}
fclose(fp);


/*printf("TIMES VECTOR: \n");
for (unsigned int i=0; i < vec_size; ++i)
{
printf("TIMES_H[ %u ] is ",i);
printf("%f \n", Times_h[i]);
}*/

printf("Count is %u \n",count);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Allocate device variables ----------------------------------------------

printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);

cudaMalloc((void**) &A_d, mat_size*sizeof(double)); // Create device variable for matrix A
cudaMalloc((void**) &T_d, mat_size*sizeof(double)); // Create device variable for matrix T
cudaMalloc((void**) &Delta_d, mat_size*sizeof(double)); // Create device variable for matrix Delta
cudaMalloc((void**) &E_d, mat_size*sizeof(double)); // Create device variable for matrix E
cudaMalloc((void**) &p_d, mat_size*sizeof(double)); // Create device variable for matrix p
cudaMalloc((void**) &p2_d, vec_size*sizeof(double)); // Create device variable for vector p2
cudaMalloc((void**) &D_d, vec_size*sizeof(double)); // Create device variable for vector D
cudaMalloc((void**) &Times_d, vec_size*sizeof(double)); // Create device variable for vector Times
cudaMalloc((void**) &ones_d, vec_size*sizeof(double)); // Create device variable for vector ones
cudaMalloc((void**) &mu_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_d, sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_1, vec_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_2, mat_size*sizeof(double)); // Create device variable for constant omega

cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Copy host variables to device ------------------------------------------

printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);

cudaMemcpy(A_d,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(T_d,T_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Delta_d,Delta_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(E_d,E_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p_d,p_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p2_d,p2_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(D_d,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ones_d,ones_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(mu_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var

cudaMemcpy(temp_1,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_2,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var


cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Launch kernel using standard sgemm interface ---------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);

int MAX_ITER = 100;
double TOL = .001;

calibrate(vec_size,mu_d, alpha_d, omega_d, A_d, T_d, Delta_d, E_d, p_d, p2_d, D_d, ones_d, Times_d,
MAX_ITER, TOL, temp_1, temp_2);


//tiledSgemm('N', 'N', matArow, matBcol, matBrow, 1.0f, \
// A_d, matArow, B_d, matBrow, 0.0f, C_d, matBrow); // A1_d, B1_d);

cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));

// Copy device variables from host ----------------------------------------

printf("Copying data from device to host...\n"); fflush(stdout);
startTime(&timer);


cudaMemcpy(mu_h,mu_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(alpha_h,alpha_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(omega_h,omega_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var

printf("mu is %f: \n",mu_h);
printf("alpha is %f: \n",alpha_h);
printf("omega is %f: \n",omega_h);

cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));


// Free memory ------------------------------------------------------------

free(A_h);
free(T_h);
free(Delta_h);
free(E_h);
free(p_h);
free(p2_h);
free(D_h);
free(ones_h);
free(Times_h);
free(mu_h);
free(alpha_h);
free(omega_h);

cudaFree(A_d);
cudaFree(T_d);
cudaFree(Delta_d);
cudaFree(E_d);
cudaFree(p_d);
cudaFree(p2_d);
cudaFree(D_d);
cudaFree(ones_d);
cudaFree(Times_d);
cudaFree(mu_d);
cudaFree(alpha_d);
cudaFree(omega_d);

return 0;
}

GPU 上的内核代码是:

/*****************************************************************************************/
#include <stdio.h>

#define TILE_SIZE 16
#define BLOCK_SIZE 512

__global__ void mysgemm(int m, int n, int k, const double *A, const double *B, double* C) {

__shared__ float ds_A[TILE_SIZE][TILE_SIZE];
__shared__ float ds_B[TILE_SIZE][TILE_SIZE];

int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = (by*TILE_SIZE+ty);//%m;
int col = (bx*TILE_SIZE+tx);//%n;
float pvalue = 0;


for(int i=0;i<(k-1)/TILE_SIZE+1;++i)
{
if((i*TILE_SIZE +tx < k) && (row < m))
ds_A[ty][tx] = A[row*k+i*TILE_SIZE+tx];
else ds_A[ty][tx] = 0;

if((i*TILE_SIZE+ty < k) && (col < n))
ds_B[ty][tx] = B[(i*TILE_SIZE+ty)*n+col]; // Load data into shared memory
else ds_B[ty][tx] = 0;

__syncthreads();

if(row < m && col < n)
{
for(int j=0;j<TILE_SIZE;++j)
{
//if(j < k)
pvalue += ds_A[ty][j]*ds_B[j][tx];
}
}
__syncthreads();
}

if(row < m && col < n)
C[row*n+col] = pvalue;
}

// Kernel to multiply each element in A by the corresponding element in B and store
// the result to the corresponding element in C. All vectors should be of length m
__global__ void elem_mul(int m, const double *A, const double *B, double* C)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
C[i] = A[i]*B[i];
}

// Kernel for parallel sum
__global__ void reduction(double *out, double *in, unsigned size)
{
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int t = threadIdx.x;
unsigned int start = 2*blockIdx.x*blockDim.x;

if(start + t >= size)
partialSum[t] = 0;
else partialSum[t] = in[start+t];

if(start + blockDim.x+t>= size)
partialSum[blockDim.x+t] = 0;
else partialSum[blockDim.x+t] = in[start + blockDim.x+t];

for(unsigned int stride = 1; stride <=blockDim.x; stride*=2)
{
__syncthreads();
if(t % stride ==0)
partialSum[2*t]+=partialSum[2*t+stride];
}

__syncthreads();

out[blockIdx.x] = partialSum[0];
}

// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
dim3 dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
reduction<<<dimGrid,dimBlock>>>(out,temp,m);
}

// Kernel to multiply each element in the matrix out in the following manner:
// out(i,j) = in(i) - in(j)
__global__ void fill(int m, const double *in, double *out)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;

int i = tx+bx*blockDim.x;
int j = ty+by*blockDim.y;

if((i < m) && (j < m))
out[i*m+j] = in[i]-in[j];
}

// Kernel to fill the matrix out with the formula out(i,j) = exp(-omega*T(i.j))
__global__ void fill_E(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;

if(i < m)
out[i] = exp(-coeff * in[i]);
}

// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_mul(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;

if(i < m*k)
out[i] = coeff * in[i];
}

// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_add(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;

if(i < m*k)
out[i] = coeff + in[i];
}

// Kernel to update vector p2
__global__ void update_p2(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;

if(i < m)
out[i] = coeff/in[i];
}

// Kernel to update matrix p
__global__ void update_p(int m, double* p2, double *denom, double *num, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;

// loop through columns j
for(int j=0; j<m; ++j)
{
if(i == j)
out[i*m + j] = p2[i];
else if(i < m)
out[i*m + j] = num[i*m+j]/denom[i];
}
}


/*****************************************************************************************/
// int size: length of the Time-series vectors. Also the number of rows and columns in input matrices
// double mu: One of three parameters calibrated
// double alpha: One of three parameters calibrated
// double omega: One of three parameters calibrated
// double* A: A matrix filled out and used to calibrate
// double* T: A distance matrix T(i,j) = Times[i]-Times[j]
// double* Delta: A dissimilarity matrix Delta(i,j) = 1 if i > j, 0 otherwise
// double* E: A matrix filled out and used to calibrate--E(i,j) = exp(-omega*T(i,j))
// double* p: A probability matrix of cross excitations
// double* p2: A vector of self-excitation probabilities
// double* ones: A (size x 1) vector of 1's used in inner products and identity transformations
// double* Times: A (size x 1) vector of time series data to be calibrated
// int MAX_ITER: The maximum number of iterations allowed in the calibration
// double* TOL: The error tolerance or accuracy allowed in the calibration
// double* temp_1: A (size x 1) temporary vector used in intermediate calculations
// double* temp_2: A temporary matrix used in intermediate calculations
/*****************************************************************************************/
void calibrate(int size, double *mu, double *alpha, double *omega, double *A, double *T, double *Delta, double *E, double *p, double *p2, double *D, double* ones, double *Times, int MAX_ITER, double TOL, double* temp_1, double* temp_2)
{

//1) (a) Perform inner product to start initial values of mu, alpha, and omega
*mu = .11; // ERROR IS HERE!!
/*
inner_product(mu, size, Times, ones, temp_1);

double a = *(mu);
a = a/size;
*mu = .11;

/*
/size;
*alpha = *mu;
*omega = *mu;


double mu_t = 0;
double alpha_t = 0;
double omega_t = 0;
double err = 0;
int ctr = 0;

//1) (b) Fill out matrix T of time differences
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill<<<dimGrid,dimBlock>>>(size, Times, T);


while(ctr < MAX_ITER && err < TOL)
{
// 2) Fill out matrix E
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill_E<<<dimGrid,dimBlock>>>(size, omega, T, E);

// 3) Update matrix A
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
scal_mult<<<dimGrid,dimBlock>>>(size,size, alpha, delta, A);
scal_mult<<<dimGrid,dimBlock>>>(size,size, omega, A, A);

dim3 dimGrid((n-1)/TILE_SIZE+1,(m-1)/TILE_SIZE+1,1);
dim3 dimBlock(TILE_SIZE,TILE_SIZE,1);
mysgemm<<<dimGrid,dimBlock>>>(size,size,size,A,E,A)


// 4) Update matrix D
mysgemm<<<dimGrid,dimBlock>>>(size,size,1,A,ones,D);
scal_add<<<dimGrid,dimBlock>>>(size,size, mu, D, D);

// 5) Update matrix p and vector p2
update_p2<<<dimGrid,dimBlock>>>(size,mu, D, p2);
update_p<<<dimGrid,dimBlock>>>(size,p2, D, A, p);

// 6) Update parameters mu, alpha, omega
inner_product(mu_t, size, p2, ones, temp_1);
mu_t /=Times[size-1];

reduction<<<dimGrid,dimBlock>>>(alpha_t,p,size*size);
alpha_t/= size;

// Treat T and p as very long vectors and calculate the inner product
inner_product(omega_t, size*size, T, p, temp_2);
omega_t = alpha_t/omega_t;

// 7) Update error
ctr++;
err = (mu - mu_t)*(mu - mu_t) + (alpha-alpha_t)*(alpha-alpha_t) + (omega-omega_t)*(omega-omega_t);
mu = mu_t;
alpha = alpha_t;
omega = omega_t;

cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("CUDA error: %s\n",cudaGetErrorString(error));
exit(-1);
}
}
*/
}

但是,我认为这段代码的 99% 与问题无关(目前我没有使用“support.h”中的任何内容。基本上,我在取消引用 GPU 上的指针时遇到错误,即使它可能不为空。谢谢!

最佳答案

如果你这样做 proper cuda error checking您会发现代码的另一个问题,这一行:

cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); 

应该是这样的:

cudaMemcpy(Times_d,Times_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); 

但这不是问题的关键。我花了一段时间才弄清楚您没有进行任何内核调用。如果您调用内核,则您传递给该内核的所有参数都必须可供设备访问。所以如果你传递一个指针,这个指针必须指向设备内存。您正在使用 mu_d 执行此操作,它是一个设备指针:

calibrate(vec_size,mu_d,...

但是您的calibrate 不是内核!!

它是一个运行在主机(CPU)上的普通主机函数。因此,当您尝试在主机代码中取消引用设备指针 mu_d 时:

*mu = .11; // ERROR IS HERE!!

你遇到段错误。我不确定您为什么要尝试以这种方式进行调试,但简单地将内核调用转换为主机例程,同时保持所有参数不变,并不是一种有效的调试方式。

基本 CUDA 规则(忽略 cuda 6 统一内存):

  1. 您不能在设备代码中取消引用主机指针
  2. 您不能在主机代码中取消引用设备指针

您的代码违反了上面的第二条规则。

关于c - 在 CUDA C 中取消引用指针,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/20607546/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com