cuda - 用CUDA求解线性系统AX = B-6ren

cuda - 用CUDA求解线性系统AX = B

转载作者：行者123 更新时间：2023-12-04 10:03:49

我可以使用新的cuSOLVER库(CUDA 7)求解以下形式的线性系统吗

AX = B

A， X和 B在哪里是 NxN密集矩阵？

最佳答案

是的。

方法。 1

在cuSOLVER框架中，您可以使用QR分解，请参阅QR decomposition to solve linear systems in CUDA。

方法。 2

或者，您可以通过连续累加来计算矩阵逆

 cublas<t>getrfBatched()

计算矩阵的LU分解，以及

cublas<t>getriBatched()

从LU分解开始计算矩阵的逆。

方法。 3

最终的可能性是使用

cublas<t>getrfBatched()

其次是双重调用

cublas<t>trsm()

解决上或下三角线性系统。

正如罗伯特·克罗维拉(Robert Crovella)所指出的，答案可能在所涉及矩阵的大小和类型上有所不同。

方法nr的代码。 1

请参阅 QR decomposition to solve linear systems in CUDA。

方法代码nr。 2和nr。 3

下面，我要报告一个实现方法nr的有效示例。 2和 3。 Hankel matrices用于为条件良好的可逆矩阵提供方法。请注意方法nr。 3需要根据调用 cublas<t>getrfBatched()后获得的数据透视数组对系统系数向量进行置换(重新排列)。可以在CPU上方便地完成此排列。

#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */

#include "cuda_runtime.h" 
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define prec_save 10

#define BLOCKSIZE 256

#define BLOCKSIZEX 16
#define BLOCKSIZEY 16

/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {

    double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));

    // --- Initialize random seed
    srand(time(NULL));

    // --- Generate random numbers
    for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();

    // --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];

    free(h_atemp);

}

/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref, 
                               double * __restrict h_y, const int N) {

    for (int k = 0; k < N; k++) h_y[k] = 0.f;

    for (int m = 0; m < N; m++)
        for (int n = 0; n < N; n++)
            h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];

}

/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
    for (int i = 0; i < N; i++) {
        double temp = vec[i];
        vec[i] = vec[pivotArray[i] - 1];
        vec[pivotArray[i] - 1] = temp;
    }   
}

/********/
/* MAIN */
/********/
int main() {

    const unsigned int N = 1000;

    const unsigned int Nmatrices = 1;

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    TimingGPU timerLU, timerApproach1, timerApproach2;
    double timingLU, timingApproach1, timingApproach2;

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Matrices to be inverted (only one in this example)
    double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));

    // --- Setting the Hankel matrix
    setHankelMatrix(h_A, N);

    // --- Defining the solution
    double *h_xref = (double *)malloc(N * sizeof(double));
    for (int k = 0; k < N; k++) h_xref[k] = 1.f;

    // --- Coefficient vectors (only one in this example)
    double *h_y = (double *)malloc(N * sizeof(double));

    computeCoefficientsVector(h_A, h_xref, h_y, N);

    // --- Result (only one in this example)
    double *h_x = (double *)malloc(N * sizeof(double));

    // --- Allocate device space for the input matrices 
    double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
    double *d_y; gpuErrchk(cudaMalloc(&d_y, N *                 sizeof(double)));
    double *d_x; gpuErrchk(cudaMalloc(&d_x, N *                 sizeof(double)));

    // --- Move the relevant matrices from host to device
    gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y, h_y, N *                 sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE LU DECOMPOSITION */
    /**********************************/
    timerLU.StartCounter();

    // --- Creating the array of pointers needed as input/output to the batched getrf
    double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;

    double **d_inout_pointers;
    gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc(&d_InfoArray,      Nmatrices * sizeof(int)));

    int *h_InfoArray  = (int *)malloc(Nmatrices * sizeof(int));

    cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    timingLU = timerLU.GetCounter();
    printf("Timing LU decomposition %f [ms]\n", timingLU);

    /*********************************/
    /* CHECKING THE LU DECOMPOSITION */
    /*********************************/
    saveCPUrealtxt(h_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
    saveCPUrealtxt(h_y,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
    saveGPUrealtxt(d_A,          "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
    saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

    /******************************************************************************/
    /* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
    /******************************************************************************/
    timerApproach1.StartCounter();

    // --- Allocate device space for the inverted matrices 
    double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

    // --- Creating the array of pointers needed as output to the batched getri
    double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
    for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));

    double **d_out_pointers;
    gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
    gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
    free(h_out_pointers);

    cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i] != 0) {
        fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
        }

    double alpha1 = 1.f;
    double beta1 = 0.f;

    cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

    timingApproach1 = timingLU + timerApproach1.GetCounter();
    printf("Timing approach 1 %f [ms]\n", timingApproach1);

    /**************************/
    /* CHECKING APPROACH NR.1 */
    /**************************/
    saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

    /*************************************************************/
    /* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
    /*************************************************************/
    timerApproach2.StartCounter();

    double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));

    gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
    int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
    gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

    rearrange(h_y, h_pivotArray, N);
    gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

    // --- Now P*A=L*U
    //     Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

    // --- 1st phase - solve Ly = b 
    const double alpha = 1.f;

    // --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result 

    // --- Lower triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    // --- Upper triangular part
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

    timingApproach2 = timingLU + timerApproach2.GetCounter();
    printf("Timing approach 2 %f [ms]\n", timingApproach2);

    /**************************/
    /* CHECKING APPROACH NR.2 */
    /**************************/
    saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

    return 0;
}

运行此示例所需的 Utilities.cu和 Utilities.cuh文件在此 github page上维护。 TimingGPU.cu和 TimingGPU.cuh文件在此 github page中维护。

关于第三种方法的一些有用的引用资料:

NAG Fortran Library Routine Document

Scientific Computing Software Library (SCSL) User’s Guide

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

编辑

进近nr的时间(以毫秒为单位)。 2和3(在GTX960卡上执行的测试，抄送5.2)。

N        LU decomposition       Approach nr. 2       Approach nr. 3
100      1.08                   2.75                 1.28
500      45.4                   161                  45.7
1000     302                    1053                 303

出现时，请接近nr。 3更为方便，其成本本质上是计算LU分解的成本。此外:

通过LU分解求解线性系统比使用QR分解更快(请参阅QR decomposition to solve linear systems in CUDA)；

LU分解仅限于平方线性系统，而QR分解在非平方线性系统的情况下有帮助。

下面的Matlab代码可用于检查结果

clear all
close all
clc

warning off

N = 1000;

% --- Setting the problem solution
x = ones(N, 1);

%%%%%%%%%%%%%%%%%%%%%
% NxN HANKEL MATRIX %
%%%%%%%%%%%%%%%%%%%%%
% --- https://en.wikipedia.org/wiki/Hankel_matrix
load A.txt
load y.txt

A = reshape(A, N, N);

yMatlab = A * x;
fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% COMPUTATION OF THE LU DECOMPOSITION %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[Lmatlab, Umatlab] = lu(A);

load Adecomposed.txt
Adecomposed = reshape(Adecomposed, N, N);

L = eye(N);
for k = 1 : N
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
end
U = zeros(N);
for k = 1 : N
    U(k, k : N) = Adecomposed(k, k : N);
end

load pivotArray.txt
Pj = eye(N);
for j = 1 : N
   tempVector = Pj(j, :);
   Pj(j, :) = Pj(pivotArray(j), :);
   Pj(pivotArray(j), :) = tempVector;
end

fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));

xprime      = inv(Lmatlab) * yMatlab;
xMatlab     = inv(Umatlab) * xprime;

fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

load xApproach1.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));

load xApproach2.txt
fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

关于cuda - 用CUDA求解线性系统AX = B，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/28794010/

文章推荐： angularjs:用于检查用户名是否存在的自定义指令

文章推荐： maven - mvn 依赖 :tree fails on trivial project

文章推荐：如果字段存在，则Django过滤查询集

r - 求解 R 中混合模型方程的积分
我正在使用混合效应模型，并且由于我的方法的特殊性我需要解决下面模型的积分，然后制作图表获得的估计值。换句话说，我需要求解下面的积分: 其中，di^2 是我模型中的 Var3，dh 是混合效应模型对应
r - 求解 R 中的不定方程组
我有一个方程组，我想用数值方法求解它。给定起始种子，我想得到一个接近的解决方案。让我解释。我有一个常量向量，X，值: X <- (c(1,-2,3,4)) 和一个向量 W 的权重: W <- (c(
r - 求解 R 中的非线性方程组
假设我有以下方程组: a * b = 5 sqrt(a * b^2) = 10 如何求解 R 中 a 和 b 的这些方程？我想这个问题可以说是一个优化问题，具有以下功能......？ fn <- f
r - 求解 R 中的微分方程组
我在 R 中有一个简单的通量模型。它归结为两个微分方程，对模型中的两个状态变量进行建模，我们将它们称为 A和 B .它们被计算为四个分量通量的简单差分方程 flux1-flux4 , 5 个参数 p1
r - 求解 R 中函数的逆
R有什么办法吗？求解给定单变量函数的反函数？动机是我以后告诉R使用值向量作为反函数的输入，以便它可以吐出反函数值。例如，我有函数 y(x) = x^2 ，逆是 y = sqrt(x) .有没有办法R
excel - 求解 x 的方程
我在字符串中有以下方程 y = 18774x + 82795 求解x我会这样做:- x = (y-82795) / 18774 我知道y的值但是方程一直在变化，并且始终采用字符串格式是否可以简单地
haskell - 求解(两个以上)线性不等式系统
如果我用 diophantine(2*x+3*y-5*z-77) 我收到了这个结果。 {(t_0, -9*t_0 - 5*t_1 + 154, -5*t_0 - 3*t_1 + 77)} 到目前为止还
r - 求解 ODE - 只有正解
我正在尝试求解仅限于正解的 ODE，即: dx/dt=f(x) x>=0。在 MATLAB 中这很容易实现。 R 是否有任何变通方法或包来将解决方案空间限制为仅正值？这对我来说非常重要，不幸的是没
antlr - 求解 ANTLR 相互左递归规则
下面的 ANTLR 文法中的 'expr' 规则显然是相互左递归的。作为一个 ANTLR 新手，我很难解决这个问题。我已经阅读了 ANTLR 引用书中的“解决非 LL(*) 冲突”，但我仍然没有看到解
r - 求解 R 中的简单方程
我有一个关于在 R 中求解函数的可能性的非常基本的问题，但知道答案确实有助于更好地理解 R。我有以下等式: 0=-100/(1+r)+(100-50)/(1+r)^2+(100-50)/(1+r)^
java - 求解 N 个皇后时出现数组索引越界异常
我正在编写使用递归回溯来解决 8 个皇后问题的代码(将 n 个国际象棋皇后放在 n × n 的棋盘上，这样皇后就不会互相攻击)。我的任务是创建两个方法:编写一个公共(public)solveQuee
r - 求解 R 中的线性方程组
我不知道在以下情况下如何进行，因为最后一个方程没有所有 4 个变量。所以使用了等式下面的代码，但这是错误的......有谁知道如何进行？方程: 3a + 4b - 5c + d = 10 2a +
math - 求解 AVL 树中节点数的递归关系？
假设我们有这个递归关系，它出现在 AVL 树的分析中: F1 = 1 F2 = 2 Fn = Fn - 1 + Fn - 2 + 1(其中 n ≥ 3) 你将如何解决这个递归以获得 F(n) 的封闭形
c - 求解 Maple 中的变量
在Maple中，有谁知道是否存在一个函数来求解变量？例如，我正在尝试求解 r 的 solve4r=(M-x^y)*(r^(-1)) mod (p-1)。所以我知道 M、x、y 和 p 的值，但不知道
c# - 求解 WAV 文件中的振幅和频率
我也问过这个here在声音设计论坛上，但问题是沉重的计算机科学/数学，所以它实际上可能属于这个论坛: 因此，通过读取文件中的二进制文件，我能够成功地找到关于 WAV 文件的所有信息，除了 big si
java - 求解 boolean 表达式时如何思考？
我有以下问题: 设 a 和 b 为 boolean 变量。是否可以设置 a 和 b 的值以使以下表达式的计算结果为 false？ b or (((not a) or (not a)) or (a or
c - 求解 C 中的超越方程
我需要用 C 求解这个超越方程: x = 2.0 - 0.5sen(x) 我试过这个: double x, newx, delta; x = 2.0 - 0.5; newx = sin(x); del
c++ - OpenCV 求解 PnPRansac
我在 Windows 上使用 OpenCV 3.1。一段代码: RNG rng; // random number generator cv::Mat rVec = (cv::Mat_(3, 1)
python - 求解 3 个变量的隐式二次系统
我正在尝试求解一个包含 3 个变量和数量可变的方程的方程组。基本上，系统的长度在 5 到 12 个方程之间，无论有多少个方程，我都试图求解 3 个变量。看起来像这样: (x-A)**2 + (y-
algorithm - 求解 ODE 算法的有限差分法
我正在尝试为有限差分法设计一种算法，但我有点困惑。所讨论的 ODE 是 y''-5y'+10y = 10x，其中 y(0)=0 且 y(1)=100。所以我需要一种方法来以某种方式获得将从关系中乘以“

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

cuda - 用CUDA求解线性系统AX = B