c++ - Sparse matrix-vector multiplication in CUDA

I am trying to implement matrix-vector multiplication on a GPU (using CUDA).

In my C++ code (on the CPU) I load the matrix as a dense matrix, and I then perform the matrix-vector multiplication with CUDA. I am also using shared memory to improve performance.

  1. How can I load my matrix in an efficient way, knowing that it is a sparse matrix? (A sketch of reading it directly into CSR format follows the function below.)

Here is the C++ function I use to load the matrix:

int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
    unsigned int w, h, x, y, num_entries;

    float val;

    std::ifstream file( filename );

    if ( file )
    {
        file >> h >> w >> num_entries;
        std::cout << w << " " << h << " " << num_entries << "\n";

        assert( w == h || w == 1 || h == 1 );

        if( dim != NULL ) *dim = std::max( w, h );

        // value-initialize to zero so entries absent from the file stay 0
        matrix = new float[ w * h ]();

        unsigned int i;
        for( i = 0; i < num_entries; i++ ){

            if( file.eof() ) break;

            file >> y >> x >> val;

            if( majority == ROW_MAJOR ){

                matrix[ w * y + x ] = val;

            } else if( majority == COLUMN_MAJOR ){

                matrix[ h * x + y ] = val;
            }
        }
        file.close();

        if( i == num_entries )
            std::cout << "\nFile read successfully\n";
        else
            std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries expected = " << num_entries << "\n";

        // print first few elements
        if( w == h ){
            for( unsigned int i = 0; i < w; i++ ){

                printf("\n");
                for( unsigned int j = 0; j < h; j++ ){

                    printf("%.2f ", matrix[ j + w * i ] );
                }
            }
        }
        else{

            printf("\n");
            for( unsigned int j = 0; j < h; j++ ){

                printf("%.2f ", matrix[ j ] );
            }
        }

    } else {

        std::cout << "Unable to open file\n";
        return false;
    }

    return true;
}
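For reference, here is a minimal sketch (my own illustration, not part of the original post) of reading the same coordinate-format file directly into CSR arrays instead of a dense buffer. readMatrixCSR and its output arrays are hypothetical names, and the sketch assumes zero-based row/column indices in the file, exactly as readMatrix above does.

#include <fstream>
#include <vector>
#include <algorithm>

// Hedged sketch: build CSR arrays (val, colInd, rowPtr) from a
// "rows cols num_entries" header followed by "row col value" triplets.
bool readMatrixCSR( const char *filename,
                    std::vector<float> &val,
                    std::vector<int>   &colInd,
                    std::vector<int>   &rowPtr,
                    unsigned int       &dim )
{
    std::ifstream file( filename );
    if ( !file ) return false;

    unsigned int h, w, num_entries;
    file >> h >> w >> num_entries;
    dim = std::max( w, h );

    // --- Read the coordinate entries first
    std::vector<int>   rows( num_entries ), cols( num_entries );
    std::vector<float> vals( num_entries );
    for ( unsigned int i = 0; i < num_entries; ++i )
        file >> rows[i] >> cols[i] >> vals[i];

    // --- Count nonzeros per row, then prefix-sum into row offsets
    rowPtr.assign( dim + 1, 0 );
    for ( unsigned int i = 0; i < num_entries; ++i ) rowPtr[ rows[i] + 1 ]++;
    for ( unsigned int r = 0; r < dim; ++r ) rowPtr[ r + 1 ] += rowPtr[ r ];

    // --- Scatter values and column indices into their row segments
    val.resize( num_entries );
    colInd.resize( num_entries );
    std::vector<int> next( rowPtr.begin(), rowPtr.end() - 1 );
    for ( unsigned int i = 0; i < num_entries; ++i ) {
        int dst       = next[ rows[i] ]++;
        val[ dst ]    = vals[i];
        colInd[ dst ] = cols[i];
    }
    return true;
}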

Here is the CUDA kernel function that handles the matrix-vector multiplication:

__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
    extern __shared__ float vec[];
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float temp = 0.0;
    int vOffs = 0;

    // load vector into shared memory
    // (guard keeps the last, partial tile from reading past the end of b)
    for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs += blockDim.x) {
        if (vOffs + threadIdx.x < dim)
            vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x];
    }

    // make sure all threads are synchronized
    __syncthreads();

    if (idx < dim) {
        temp = 0.0;
        // dot product (multiplication)
        for (int i = 0; i < dim; i++){
            temp += A[idx * dim + i] * vec[i];
        }
        x[idx] = temp;
    }
}
  1. What changes do I have to make to my CUDA code, given that my matrix is a sparse matrix? (A minimal hand-written CSR kernel sketch follows this list.)
  2. I found out from a forum that padding can also be used to optimize performance, but this requires changing the way I read / sort the matrix. Any ideas on how to implement this padding with the way I read the matrix and perform the calculation?
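For context on question 1, here is a minimal sketch of a hand-written CSR kernel (my own illustration, not from the original post; val, colInd, and rowPtr are the usual CSR arrays and the names are illustrative only). Each thread computes the dot product of one sparse row with the dense vector, so only the nonzeros are visited.

// Hedged sketch: one thread per row of a CSR matrix, x = A * b.
__global__ void csr_matrix_vector( const float *val, const int *colInd,
                                   const int *rowPtr, const float *b,
                                   float *x, int numRows )
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < numRows) {
        float temp = 0.0f;
        // visit only the nonzeros of this row
        for (int j = rowPtr[row]; j < rowPtr[row + 1]; ++j)
            temp += val[j] * b[colInd[j]];
        x[row] = temp;
    }
}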

Best Answer

This is a very old post, but I would like to highlight that cuSPARSE (for some time now) provides routines for multiplication between sparse matrices, or between a sparse matrix and a dense vector.

For the CSR format, the relevant routine for multiplying a sparse matrix by a dense vector is cusparse<t>csrmv. Below is a fully worked example showing its use.

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>

#include "Utilities.cuh"

#include <cuda_runtime.h>
#include <cusparse_v2.h>

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int N = 4;                // --- Number of rows and columns

    // --- Host side dense matrices
    double *h_A_dense = (double*)malloc(N * N * sizeof(double));
    double *h_x_dense = (double*)malloc(N *     sizeof(double));
    double *h_y_dense = (double*)malloc(N *     sizeof(double));

    // --- Column-major ordering
    h_A_dense[0] = 0.4612;  h_A_dense[4] = -0.0006; h_A_dense[8]  = 0.3566; h_A_dense[12] = 0.0;
    h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640;  h_A_dense[9]  = 0.0723; h_A_dense[13] = 0.0;
    h_A_dense[2] = 0.3566;  h_A_dense[6] = 0.0723;  h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
    h_A_dense[3] = 0.;      h_A_dense[7] = 0.0;     h_A_dense[11] = 0.0;    h_A_dense[15] = 0.1;

    // --- Initializing the data and result vectors
    for (int k = 0; k < N; k++) {
        h_x_dense[k] = 1.;
        h_y_dense[k] = 0.;
    }

    // --- Create device arrays and copy host arrays to them
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double)));
    double *d_x_dense;  gpuErrchk(cudaMalloc(&d_x_dense, N     * sizeof(double)));
    double *d_y_dense;  gpuErrchk(cudaMalloc(&d_y_dense, N     * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N     * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N     * sizeof(double), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;  cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType     (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));

    int nnzA = 0;               // --- Number of nonzero elements in dense matrix A

    const int lda = N;          // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row of matrix A
    int *d_nnzPerVectorA;   gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));

    // --- Host side number of nonzero elements per row of matrix A
    int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
    gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
    for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
    printf("\n");

    // --- Device side sparse matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));

    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side sparse matrices
    double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
    int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix A in CSR format\n\n");
    for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    printf("\n");
    for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);

    printf("\n");
    for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]); printf("\n");

    const double alpha = 1.;
    const double beta  = 0.;
    cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense,
                                    &beta, d_y_dense));

    gpuErrchk(cudaMemcpy(h_y_dense, d_y_dense, N * sizeof(double), cudaMemcpyDeviceToHost));

    printf("\nResult vector\n\n");
    for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]); printf("\n");

}
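Note that cusparse<t>csrmv belongs to the legacy cuSPARSE API and was removed in CUDA 11; its replacement is the generic cusparseSpMV routine. Below is a minimal, self-contained sketch of the same double-precision CSR SpMV using the generic API (my own illustration, not part of the original answer; error checking is omitted for brevity, and on CUDA 11.x toolkits older than 11.2 the algorithm enum is CUSPARSE_MV_ALG_DEFAULT rather than CUSPARSE_SPMV_ALG_DEFAULT).

#include <cstdio>
#include <cuda_runtime.h>
#include <cusparse.h>

int main()
{
    // --- 4x4 matrix in CSR format, zero-based indexing:
    //     [1 0 2 0]
    //     [0 3 0 0]
    //     [4 0 5 6]
    //     [0 0 0 7]
    const int N = 4, nnz = 7;
    int    h_rowPtr[N + 1] = {0, 2, 3, 6, 7};
    int    h_colInd[nnz]   = {0, 2, 1, 0, 2, 3, 3};
    double h_val[nnz]      = {1., 2., 3., 4., 5., 6., 7.};
    double h_x[N]          = {1., 1., 1., 1.};
    double h_y[N]          = {0., 0., 0., 0.};

    // --- Device copies of the CSR arrays and the vectors
    int *d_rowPtr, *d_colInd;   double *d_val, *d_x, *d_y;
    cudaMalloc(&d_rowPtr, (N + 1) * sizeof(int));
    cudaMalloc(&d_colInd, nnz     * sizeof(int));
    cudaMalloc(&d_val,    nnz     * sizeof(double));
    cudaMalloc(&d_x,      N       * sizeof(double));
    cudaMalloc(&d_y,      N       * sizeof(double));
    cudaMemcpy(d_rowPtr, h_rowPtr, (N + 1) * sizeof(int),    cudaMemcpyHostToDevice);
    cudaMemcpy(d_colInd, h_colInd, nnz     * sizeof(int),    cudaMemcpyHostToDevice);
    cudaMemcpy(d_val,    h_val,    nnz     * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x,      h_x,      N       * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y,      h_y,      N       * sizeof(double), cudaMemcpyHostToDevice);

    cusparseHandle_t handle;    cusparseCreate(&handle);

    // --- Wrap the raw arrays in generic-API descriptors
    cusparseSpMatDescr_t matA;
    cusparseCreateCsr(&matA, N, N, nnz, d_rowPtr, d_colInd, d_val,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
    cusparseDnVecDescr_t vecX, vecY;
    cusparseCreateDnVec(&vecX, N, d_x, CUDA_R_64F);
    cusparseCreateDnVec(&vecY, N, d_y, CUDA_R_64F);

    // --- y = alpha * A * x + beta * y
    const double alpha = 1., beta = 0.;
    size_t bufferSize = 0;      void *dBuffer = NULL;
    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, matA, vecX, &beta, vecY,
                            CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                 &alpha, matA, vecX, &beta, vecY,
                 CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);

    cudaMemcpy(h_y, d_y, N * sizeof(double), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; ++i) printf("h_y[%i] = %f\n", i, h_y[i]);

    // --- Cleanup
    cusparseDestroySpMat(matA); cusparseDestroyDnVec(vecX); cusparseDestroyDnVec(vecY);
    cusparseDestroy(handle);
    cudaFree(dBuffer); cudaFree(d_rowPtr); cudaFree(d_colInd);
    cudaFree(d_val); cudaFree(d_x); cudaFree(d_y);
    return 0;
}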

Regarding c++ - sparse matrix-vector multiplication in CUDA, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/5969506/
