gpt4 book ai didi

cuda - CUBLAS:零主元矩阵的不正确反演

转载 作者:行者123 更新时间:2023-12-01 04:37:06 25 4
gpt4 key购买 nike

从 CUDA 5.5 开始，CUBLAS 库包含用于批量矩阵分解和求逆的例程（分别为 cublas<t>getrfBatched 和 cublas<t>getriBatched）。

从文档中获取指南,我编写了一个使用这些例程反转 N x N 矩阵的测试代码。仅当矩阵具有所有非零主元时,代码才会给出正确的输出。将任何枢轴设置为零会导致不正确的结果。我已经使用 MATLAB 验证了结果。

我意识到我提供行主矩阵作为输入,而 CUBLAS 需要列主矩阵,但这应该无关紧要,因为它只会转置结果。可以肯定的是,我还对列主要输入进行了测试,但得到了相同的行为。

我很困惑,cublas<t>getriBatched期望枢轴交换信息数组 P作为输入,这是来自 cublas<t>getrfBatched 的输出.因此,如果行交换消除了任何零枢轴,则反转例程应自动处理它。

如何使用 CUBLAS 执行包含零主元的矩阵求逆?

以下是具有不同测试用例的自包含可编译示例:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Error-checking wrapper for CUDA runtime API calls.
// On failure it prints the file, line, and human-readable reason,
// resets the device, and aborts the process. The do { ... } while (0)
// idiom makes the macro expand to a single statement, so it is safe
// inside un-braced if/else bodies.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)

// Error-checking wrapper for cuBLAS API calls (which return
// cublasStatus_t, not cudaError_t). cuBLAS has no strerror-style
// helper in this CUDA version, so only the numeric status code is
// printed before resetting the device and aborting.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)


// Invert an n x n matrix resident in device memory using the batched
// cuBLAS LU routines with a batch of size 1.
//   src_d : device pointer to the input matrix (n*n floats); it is
//           overwritten with its LU factors by getrfBatched.
//   dst_d : device pointer receiving the n*n inverse.
//   n     : matrix dimension.
// NOTE(review): cuBLAS is column-major; feeding a row-major matrix
// yields the inverse of its transpose (equivalently, the transposed
// inverse) — presumably acceptable to the caller, confirm.
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    int batchSize = 1;

    int *P, *INFO;   // P: n pivot indices per matrix; INFO: one status per matrix

    cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));

    int lda = n;

    // The batched routines take an array of device pointers (one entry
    // per matrix in the batch); the pointer array itself must also live
    // in device memory.
    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));

    // getrfBatched sets INFO = i (> 0) when U(i,i) == 0, i.e. a zero
    // pivot can be reported at ANY position, not only the last — so the
    // original "== n" comparison missed most singular matrices.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));

    // getriBatched consumes the LU factors plus the pivot array P from
    // getrfBatched and writes the inverse into C_d's target buffer.
    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release everything, including the device pointer arrays A_d and
    // C_d that the original version leaked.
    cudaFree(A_d);
    cudaFree(C_d);
    cudaFree(P);
    cudaFree(INFO);
    cublasDestroy_v2(handle);
}

// Host-side convenience wrapper: copies an n x n matrix to the GPU,
// inverts it there via invert_device(), and copies the result back.
// src and dst may alias (in-place inversion from the caller's view).
void invert(float* src, float* dst, int n)
{
    float *d_in = NULL;
    float *d_out = NULL;
    const size_t bytes = (size_t)n * n * sizeof(float);

    cudacall(cudaMalloc<float>(&d_in, bytes));
    cudacall(cudaMalloc<float>(&d_out, bytes));
    cudacall(cudaMemcpy(d_in, src, bytes, cudaMemcpyHostToDevice));

    invert_device(d_in, d_out, n);

    cudacall(cudaMemcpy(dst, d_out, bytes, cudaMemcpyDeviceToHost));

    cudaFree(d_in);
    cudaFree(d_out);
}

// Exercises invert() on a 3x3 test matrix. Several candidate matrices
// (full pivots, zero first pivot, column-major variant, singular) are
// declared; point "a" at the one to run. The inversion is performed
// in place: input and output share the same host buffer.
void test_invert()
{
    const int n = 3;

    //Random matrix with full pivots
    float full_pivots[n*n] = { 0.5, 3, 4,
                               1, 3, 10,
                               4, 9, 16 };

    //Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3, 4,
                              1, 3, 10,
                              4, 9, 16 };

    float zero_pivot_col_major[n*n] = { 0, 1, 4,
                                        3, 3, 9,
                                        4, 10, 16 };

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n * n] = { 22, 3, 4,
                                        1, 5, 6,
                                        9, 8, 2 };

    float singular[n*n] = { 1, 2, 3,
                            4, 5, 6,
                            7, 8, 9 };

    //Select matrix by setting "a"
    float* a = zero_pivot;

    fprintf(stdout, "Input:\n\n");
    for (int row = 0; row < n; row++)
    {
        for (int col = 0; col < n; col++)
            fprintf(stdout, "%f\t", a[row * n + col]);
        fprintf(stdout, "\n");
    }

    fprintf(stdout, "\n\n");

    invert(a, a, n);   // in-place: result overwrites the input buffer

    fprintf(stdout, "Inverse:\n\n");
    for (int row = 0; row < n; row++)
    {
        for (int col = 0; col < n; col++)
            fprintf(stdout, "%f\t", a[row * n + col]);
        fprintf(stdout, "\n");
    }

}

// Entry point: run the inversion demo, then block on stdin so a
// console window launched from a GUI stays open until the user types
// a number.
int main()
{
    test_invert();

    int n;
    // Check the result so compilers with warn_unused_result on scanf
    // stay quiet; on EOF/bad input there is nothing left to do anyway.
    if (scanf("%d", &n) != 1)
    {
        /* ignore: pause only */
    }
    return 0;
}

最佳答案

当前 CUBLAS 库的 cublas<t>getrfBatched 实现中似乎存在一个错误：当矩阵维度 n 满足 3<=n<=16，且如您所说出现“零主元”时，就会触发该错误。

一种可能的解决方法是：当 n<17 时，对要求逆的矩阵 A 做“单位矩阵扩充”（identity extension），将其扩展为 17x17 的矩阵（下面使用 MATLAB 记法）：

 LU = getrf( [A 0 ; 0 I]);

接下来，您就可以按“常规”方式使用 cublas<t>getriBatched：
 invA = getri( LU(1:3,1:3) )

(您也可以将所有内容保留在 n=17,以这种方式调用 getri,然后将结果提取为 invA 的前 3x3 行和列。)

这是一个完整的示例（借鉴了您提供的代码），演示对您给出的 3x3 zero_pivot 矩阵求逆，其中使用 zero_pivot_war 矩阵作为“单位矩阵扩充”的解决方法：
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

// Abort-on-error guard for CUDA runtime calls: reports file, line and
// cudaGetErrorString() reason, resets the device, and exits. Wrapped
// in do/while(0) so the expansion is one statement and composes safely
// with surrounding control flow.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)

// Abort-on-error guard for cuBLAS calls. cuBLAS returns its own
// cublasStatus_t enum; only the raw numeric code is printed because
// this CUDA version offers no status-to-string helper.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)


// Workaround version of invert_device. The caller must supply src_d as
// a 17x17 "identity-extended" matrix: the original n x n matrix in the
// top-left block and the identity in the bottom-right. Factoring at
// dimension 17 sidesteps an apparent cuBLAS getrfBatched defect for
// zero-pivot matrices with 3<=n<=16; the inversion step then operates
// only on the leading n x n block, writing a compact n x n result into
// dst_d.
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    const int batchSize = 1;
    const int dim = 17;   // padded factorization size (the workaround)

    int *P, *INFO;   // P: dim pivot indices; INFO: per-matrix status

    cudacall(cudaMalloc<int>(&P, dim * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    int lda = dim;   // leading dimension of the padded matrix

    // Batched routines take a device-resident array of matrix pointers.
    float *A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    // LU-factor the full padded 17x17 matrix.
    cublascall(cublasSgetrfBatched(handle, dim, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    // INFO = i > 0 flags U(i,i) == 0 at ANY pivot position; the
    // original "== 17" comparison only caught the final one.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    // Invert only the leading n x n block: read the LU factors with
    // lda = 17 (padded layout) but write the inverse with ldc = n
    // (compact layout in dst_d).
    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, n, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Also free the device pointer arrays the original version leaked.
    cudaFree(A_d);
    cudaFree(C_d);
    cudaFree(P);
    cudaFree(INFO);
    cublasDestroy_v2(handle);
}

// Host wrapper for the workaround path: src must hold the full 17x17
// identity-extended matrix; dst receives the compact n x n inverse of
// the top-left block.
void invert(float* src, float* dst, int n)
{
    float *d_src = NULL;
    float *d_dst = NULL;
    const size_t padded_bytes = 17 * 17 * sizeof(float);
    const size_t result_bytes = (size_t)n * n * sizeof(float);

    cudacall(cudaMalloc<float>(&d_src, padded_bytes));
    cudacall(cudaMalloc<float>(&d_dst, result_bytes));
    cudacall(cudaMemcpy(d_src, src, padded_bytes, cudaMemcpyHostToDevice));

    invert_device(d_src, d_dst, n);

    cudacall(cudaMemcpy(dst, d_dst, result_bytes, cudaMemcpyDeviceToHost));

    cudaFree(d_src);
    cudaFree(d_dst);
}

// Demonstrates the identity-extension workaround: the 3x3 zero-pivot
// matrix is embedded in the top-left corner of a 17x17 matrix whose
// remaining diagonal is the identity. Only the leading 3x3 block of
// the padded matrix is printed and inverted; the result lands in a
// separate compact 3x3 buffer.
void test_invert()
{
const int n = 3;

//Random matrix with full pivots
/* float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };

//Almost same as above matrix with first pivot zero
float zero_pivot[n*n] = { 0, 3, 4,
1, 3, 10,
4 , 9, 16 };

float zero_pivot_col_major[n*n] = { 0, 1, 4,
3, 3, 9,
4 , 10, 16 };
*/
// 17x17 identity-extended version of zero_pivot: [A 0; 0 I].
float zero_pivot_war[17*17] = {
0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 };
/*
float another_zero_pivot[n*n] = { 0, 3, 4,
1, 5, 6,
9, 8, 2 };

float another_full_pivot[n * n] = { 22, 3, 4,
1, 5, 6,
9, 8, 2 };

float singular[n*n] = {1,2,3,
4,5,6,
7,8,9};
*/
// Compact destination for the n x n inverse.
float result[n*n];

//Select matrix by setting "a"
float* a = zero_pivot_war;

fprintf(stdout, "Input:\n\n");
for(int i=0; i<n; i++)
{
// Row stride is 17 here: a points at the padded 17x17 matrix.
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",a[i*17+j]);
fprintf(stdout,"\n");
}

fprintf(stdout,"\n\n");

invert(a,result,n);

fprintf(stdout, "Inverse:\n\n");
for(int i=0; i<n; i++)
{
// result is compact n x n, so the stride is n.
for(int j=0; j<n; j++)
fprintf(stdout,"%f\t",result[i*n+j]);
fprintf(stdout,"\n");
}

}

// Entry point: runs the workaround demo. The stdin pause from the
// question's version is left commented out so the program exits
// immediately (convenient under cuda-memcheck).
int main()
{
test_invert();

// int n; scanf("%d",&n);
return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:

0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000


Inverse:

-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$

以上结果在我看来是正确的,基于一个简单的测试 elsewhere .

我没有关于 CUBLAS 中可能存在的错误的性质的任何进一步的技术细节。据我所知,它存在于 CUDA 5.5 和 CUDA 6.0 RC 中。有关 NVIDIA 提供的 Assets (例如 CUBLAS 库)的详细错误讨论应在 the NVIDIA developer forums 上进行。或直接在 developer.nvidia.com 上的错误归档门户(您必须是注册开发人员才能提交错误)。

关于cuda - CUBLAS:零主元矩阵的不正确反演,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/22887167/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com