gpt4 book ai didi

Cuda 矩阵乘法

转载 作者:行者123 更新时间:2023-12-04 18:22:41 25 4
gpt4 key购买 nike

我正在尝试在 cuda 中编写矩阵乘法代码,这与 Nvidia 的 cuda 编程指南非常相似,但它不起作用。它应该做 C=alpha*A*B+beta*C ,但对于每个 A,B C 保持不变。

__global__ void MatMulKernel(int m,int n,int k,double *A,double *B,double *C,double alpha,double beta)
{
double Ctemp = 0.0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int ind;
for (ind = 0; ind < k; ++ind)
{
Ctemp += A[row+ind*m]*B[ind+col*k];
}

C[row+m*col] = alpha*Ctemp+beta*C[row+m*col];
//C[row+m*col] = Ctemp;
__syncthreads();
}

extern "C" void
local_mm_cuda (const int m, const int n, const int k, const double alpha,
const double *A, const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc)
{

int row, col;

/* Verify the sizes of lda, ldb, and ldc */
assert (lda >= m);
assert (ldb >= k);
assert (ldc >= m);

// allocating memory for device array
double *dA,*dB,*dC;
size_t sizeA = sizeof(double)*m*k;
size_t sizeB = sizeof(double)*n*k;
size_t sizeC = sizeof(double)*m*n;

cudaMalloc((void**)&dA,sizeA);
cudaMalloc((void**)&dB,sizeB);
cudaMalloc((void**)&dC,sizeC);

cudaMemcpy(dA, A, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(dB, B, sizeB, cudaMemcpyHostToDevice);
cudaMemcpy(dC, C, sizeC, cudaMemcpyHostToDevice);

// calling matrix multiplication kernal
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);
MatMulKernel<<<dimGrid, dimBlock>>>(m,n,k,dA,dB,dC,alpha,beta);
cudaThreadSynchronize();

// saving C calculated back in C
cudaMemcpy(dC,C, sizeC,cudaMemcpyDeviceToHost);
cudaFree(dA);
cudaFree(dB);
cudaFree(dC);
}

最佳答案

尝试修改

"dim3 dimGrid( n/dimBlock.x, m/dimBlock.y);"


"dim3 dimGrid( (n+dimBlock.x-1)/dimBlock.x, (m+dimBlock.y-1)/dimBlock.y); "

关于Cuda 矩阵乘法,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/10327726/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com