gpt4 book ai didi

c++ - 使用 cuda 7.0 RC 的 cusolver 计算特征值和特征向量

转载 作者:搜寻专家 更新时间:2023-10-31 02:23:47 25 4
gpt4 key购买 nike

我正在尝试使用 CUDA 7.0 RC 中发布的 cuSolver 计算最大的特征值/特征向量对。问题是我遇到了 CUSOLVER_INTERNAL_ERROR,我不知道该怎么办。

这是我手头的东西,用来调用cuda/cusparse/cusolver函数。

// my handy stuff
#define CUDA_CALL(value) do { \
cudaError_t _m_cudaStat = value; \
if (_m_cudaStat != cudaSuccess) { \
fprintf(stderr, "Error %s at line %d in file %s\N", \
cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \
exit(-1); \
}
} while(0)

#define CUSPARSE_CALL(value) do { \
cusparseStatus_t _m_status = value; \
if (_m_status != CUSPARSE_STATUS_SUCCESS){ \
fprintf(stderr, "Error %d at line %d in file %s\N", (int)_m_status, __LINE__, __FILE__); \
exit(-5); \
} \
} while(0)

#define CUSOLVER_CALL(value) do { \
cusolverStatus_t _m_status = value; \
if (_m_status != CUSOLVER_STATUS_SUCCESS){ \
fprintf(stderr, "Error %d at line %d in file %s\N", (int)_m_status, __LINE__, __FILE__); \
exit(-5); \
} \
} while(0)

这是我的代码

#include "cusparse.h"
#include "cusolverSp.h"
#include <cuda_runtime.h>
#include <math.h>

void dpss( size_t N, double NW, double *eigenvector );

int main()
{
// parameters for generation of dpss
size_t N = 128;
double NW = 1;

double *eigenvector = NULL;
eigenvector = new double[ N*sizeof( double ) ];
dpss( N, NW, eigenvector );

return 0;
}

void dpss( size_t N, double NW, double *eigenvector )
{
// define matrix T (NxN)
double** T = new double*[ N ];
for(int i = 0; i < N; ++i)
T[ i ] = new double[ N ];

// fill in T as function of ( N, W )
// T is a tridiagonal matrix, i. e., it has diagonal, subdiagonal and superdiagonal
// the others elements are 0
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if( j == i - 1 ) // subdiagonal
T[ i ][ j ] = ( (double)N - i )*i/2;
else if( j == i ) // diagonal
T[ i ][ j ] = pow( (double)(N-1)/2 - i, 2 )*std::cos( 2*NW/(double)N*M_PI )*( j == i );
else if( j == i + 1 ) // superdiagonal
T[ i ][ j ] = ( i + 1 )*( (double)N - 1 - i )/2*( j == i + 1 );
else // others elements
T[ i ][ j ] = 0;
}
}

// declarations needed
cusolverStatus_t statCusolver = CUSOLVER_STATUS_SUCCESS;
cusolverSpHandle_t handleCusolver = NULL;
cusparseHandle_t handleCusparse = NULL;
cusparseMatDescr_t descrA = NULL;
int *h_cooRowIndex = NULL, *h_cooColIndex = NULL;
double *h_cooVal = NULL;
int *d_cooRowIndex = NULL, *d_cooColIndex = NULL, *d_csrRowPtr = NULL;
double *d_cooVal = NULL;
int nnz;
double *h_eigenvector0 = NULL, *d_eigenvector0 = NULL, *d_eigenvector = NULL;
double max_lambda;

// define interval of eigenvalues of T
// interval is [-max_lambda,max_lambda]
max_lambda = ( N - 1 )*( N + 2 ) + N*( N + 1 )/8 + 0.25;

// amount of nonzero elements of T
nnz = 3*N - 2;

// allocate host memory
h_cooRowIndex = new int[ nnz*sizeof( int ) ];
h_cooColIndex = new int[ nnz*sizeof( int ) ];
h_cooVal = new double[ nnz*sizeof( double ) ];
h_eigenvector0 = new double[ N*sizeof( double ) ];

// fill in vectors that describe T as a sparse matrix
int counter = 0;
for (int i = 0; i < N; i++ ) {
for( int j = 0; j < N; j++ ) {
if( T[ i ][ j ] != 0 ) {
h_cooColIndex[counter] = j;
h_cooRowIndex[counter] = i;
h_cooVal[counter++] = T[ i ][ j ];
}
}
}

// fill in initial eigenvector guess
for( int i = 0; i < N; i++ )
h_eigenvector0[ i ] = (double)1/(i+1);

// allocate device memory
CUDA_CALL( cudaMalloc((void**)&d_cooRowIndex,nnz*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_cooColIndex,nnz*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_cooVal, nnz*sizeof( double )) );
CUDA_CALL( cudaMalloc((void**)&d_csrRowPtr, (N+1)*sizeof( int )) );
CUDA_CALL( cudaMalloc((void**)&d_eigenvector0, N*sizeof( double )) );
CUDA_CALL( cudaMalloc((void**)&d_eigenvector, N*sizeof(d_eigenvector[0])) );

// copy data to device
CUDA_CALL( cudaMemcpy( d_cooRowIndex, h_cooRowIndex, (size_t)(nnz*sizeof( int )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_cooColIndex, h_cooColIndex, (size_t)(nnz*sizeof( int )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_cooVal, h_cooVal, (size_t)(nnz*sizeof( double )), cudaMemcpyHostToDevice ) );
CUDA_CALL( cudaMemcpy( d_eigenvector0, h_eigenvector0, (size_t)(N*sizeof( double )), cudaMemcpyHostToDevice ) );

// initialize cusparse and cusolver
CUSOLVER_CALL( cusolverSpCreate( &handleCusolver ) );
CUSPARSE_CALL( cusparseCreate( &handleCusparse ) );

// create and define cusparse matrix descriptor
CUSPARSE_CALL( cusparseCreateMatDescr(&descrA) );
CUSPARSE_CALL( cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL ) );
CUSPARSE_CALL( cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO ) );

// transform from coordinates (COO) values to compressed row pointers (CSR) values
CUSPARSE_CALL( cusparseXcoo2csr( handleCusparse, d_cooRowIndex, nnz, N, d_csrRowPtr, CUSPARSE_INDEX_BASE_ZERO ) );

// define some parameters and call cusolverSpScsreigvsi
int maxite = 1e6;
double tol = 1;
double mu = 0;
statCusolver = cusolverSpDcsreigvsi( handleCusolver, N ,nnz, descrA, d_cooVal, d_csrRowPtr, d_cooColIndex, max_lambda, d_eigenvector0, maxite, tol, &mu, d_eigenvector );
// here statCusolver = CUSOLVER_INTERNAL_ERROR

cudaDeviceSynchronize();
CUDA_CALL( cudaGetLastError() );

// copy from device to host
CUDA_CALL( cudaMemcpy( h_eigenvector0, d_eigenvector, (size_t)(N*sizeof( double )), cudaMemcpyDeviceToHost ) );

// destroy and free stuff
CUSPARSE_CALL( cusparseDestroyMatDescr( descrA ) );
CUSPARSE_CALL( cusparseDestroy( handleCusparse ) );
CUSOLVER_CALL( cusolverSpDestroy( handleCusolver ) );
CUDA_CALL( cudaFree( d_cooRowIndex ) );
CUDA_CALL( cudaFree( d_cooColIndex ) );
CUDA_CALL( cudaFree( d_cooVal ) );
CUDA_CALL( cudaFree( d_csrRowPtr ) );
CUDA_CALL( cudaFree( d_eigenvector0 ) );
CUDA_CALL( cudaFree( d_eigenvector ) );
delete[] h_eigenvector0;
delete[] h_cooRowIndex;
delete[] h_cooColIndex;
delete[] h_cooVal;
}

我已经为初始特征值猜测(即 max_lambda - 或 cuSolver 库教程中的 mu0)、初始特征向量猜测(h_eigenvector0 或 d_eigenvector0)、容差 (tol)、甚至最大迭代量 (maxite) 尝试了不同的选择。

我已经检查过稀疏矩阵是否正确编写(对我来说似乎是正确的)。我还用 Matlab 检查了返回的特征向量,它们完全不同(我认为它们不应该)。

我不知道我还能做什么,但如果有人知道,请告诉我!!

提前致谢。

最佳答案

(在我看来)cuSolver 文档在 mu 参数方面可能不正确。

文档似乎表明这是在主机内存空间中,即倒数第二个参数应该是主机指针。

如果我将其更改为设备指针:

double mu = 0;
double *d_mu;
CUDA_CALL(cudaMalloc(&d_mu, sizeof(double)));
CUDA_CALL(cudaMemset(d_mu, 0, sizeof(double)));
CUSOLVER_CALL(cusolverSpDcsreigvsi( handleCusolver, N ,nnz, descrA, d_cooVal, d_csrRowPtr, d_cooColIndex, max_lambda, d_eigenvector0, maxite, tol, d_mu, d_eigenvector ));

...
CUDA_CALL(cudaMemcpy(&mu, d_mu, sizeof(double), cudaMemcpyDeviceToHost));

我可以让您的代码版本在没有任何 API 错误或 cuda-memcheck 错误的情况下运行。 (我不能保证结果。)

我已经向 NVIDIA 提交了文档查询。如果您可以确认您通过此更改获得了合理的结果,那也可能有用。我查看了生成的特征值和特征向量,虽然它们看起来并不奇怪,但我无法让它们与我在 Octave 中做同样事情的尝试完全相关。

关于c++ - 使用 cuda 7.0 RC 的 cusolver 计算特征值和特征向量,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28918447/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com