gpt4 book ai didi

cuda - 用CUDA求解线性系统AX = B

转载 作者:行者123 更新时间:2023-12-04 10:03:49 25 4
gpt4 key购买 nike

我可以使用新的cuSOLVER库(CUDA 7)求解以下形式的线性系统吗

AX = B 
AXB在哪里是 NxN密集矩阵?

最佳答案

是的。

方法。 1

在cuSOLVER框架中,您可以使用QR分解,请参阅QR decomposition to solve linear systems in CUDA

方法。 2

或者,您可以通过连续累加来计算矩阵逆

 cublas<t>getrfBatched() 

计算矩阵的LU分解,以及
cublas<t>getriBatched() 

从LU分解开始计算矩阵的逆。

方法。 3

最终的可能性是使用
cublas<t>getrfBatched() 

其次是双重调用
cublas<t>trsm() 

解决上或下三角线性系统。

正如罗伯特·克罗维拉(Robert Crovella)所指出的,答案可能在所涉及矩阵的大小和类型上有所不同。

方法nr的代码。 1

请参阅 QR decomposition to solve linear systems in CUDA

方法代码nr。 2和nr。 3

下面,我要报告一个实现方法nr的有效示例。 23Hankel matrices用于为条件良好的可逆矩阵提供方法。请注意方法nr。 3需要根据调用 cublas<t>getrfBatched()后获得的数据透视数组对系统系数向量进行置换(重新排列)。可以在CPU上方便地完成此排列。
#include <stdio.h>
#include <fstream>
#include <iomanip>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define prec_save 10

#define BLOCKSIZE 256

#define BLOCKSIZEX 16
#define BLOCKSIZEY 16

/************************************/
/* SAVE REAL ARRAY FROM CPU TO FILE */
/************************************/
template <class T>
void saveCPUrealtxt(const T * h_in, const char *filename, const int M) {

std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

T *h_in = (T *)malloc(M * sizeof(T));

gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

std::ofstream outfile;
outfile.open(filename);
for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
outfile.close();

}

/***************************************************/
/* FUNCTION TO SET THE VALUES OF THE HANKEL MATRIX */
/***************************************************/
// --- https://en.wikipedia.org/wiki/Hankel_matrix
void setHankelMatrix(double * __restrict h_A, const int N) {

double *h_atemp = (double *)malloc((2 * N - 1) * sizeof(double));

// --- Initialize random seed
srand(time(NULL));

// --- Generate random numbers
for (int k = 0; k < 2 * N - 1; k++) h_atemp[k] = rand();

// --- Fill the Hankel matrix. The Hankel matrix is symmetric, so filling by row or column is equivalent.
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
h_A[i * N + j] = h_atemp[(i + 1) + (j + 1) - 2];

free(h_atemp);

}

/***********************************************/
/* FUNCTION TO COMPUTE THE COEFFICIENTS VECTOR */
/***********************************************/
void computeCoefficientsVector(const double * __restrict h_A, const double * __restrict h_xref,
double * __restrict h_y, const int N) {

for (int k = 0; k < N; k++) h_y[k] = 0.f;

for (int m = 0; m < N; m++)
for (int n = 0; n < N; n++)
h_y[m] = h_y[m] + h_A[n * N + m] * h_xref[n];

}

/************************************/
/* COEFFICIENT REARRANGING FUNCTION */
/************************************/
void rearrange(double *vec, int *pivotArray, int N){
for (int i = 0; i < N; i++) {
double temp = vec[i];
vec[i] = vec[pivotArray[i] - 1];
vec[pivotArray[i] - 1] = temp;
}
}

/********/
/* MAIN */
/********/
int main() {

const unsigned int N = 1000;

const unsigned int Nmatrices = 1;

// --- CUBLAS initialization
cublasHandle_t cublas_handle;
cublasSafeCall(cublasCreate(&cublas_handle));

TimingGPU timerLU, timerApproach1, timerApproach2;
double timingLU, timingApproach1, timingApproach2;

/***********************/
/* SETTING THE PROBLEM */
/***********************/
// --- Matrices to be inverted (only one in this example)
double *h_A = (double *)malloc(N * N * Nmatrices * sizeof(double));

// --- Setting the Hankel matrix
setHankelMatrix(h_A, N);

// --- Defining the solution
double *h_xref = (double *)malloc(N * sizeof(double));
for (int k = 0; k < N; k++) h_xref[k] = 1.f;

// --- Coefficient vectors (only one in this example)
double *h_y = (double *)malloc(N * sizeof(double));

computeCoefficientsVector(h_A, h_xref, h_y, N);

// --- Result (only one in this example)
double *h_x = (double *)malloc(N * sizeof(double));

// --- Allocate device space for the input matrices
double *d_A; gpuErrchk(cudaMalloc(&d_A, N * N * Nmatrices * sizeof(double)));
double *d_y; gpuErrchk(cudaMalloc(&d_y, N * sizeof(double)));
double *d_x; gpuErrchk(cudaMalloc(&d_x, N * sizeof(double)));

// --- Move the relevant matrices from host to device
gpuErrchk(cudaMemcpy(d_A, h_A, N * N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_y, h_y, N * sizeof(double), cudaMemcpyHostToDevice));

/**********************************/
/* COMPUTING THE LU DECOMPOSITION */
/**********************************/
timerLU.StartCounter();

// --- Creating the array of pointers needed as input/output to the batched getrf
double **h_inout_pointers = (double **)malloc(Nmatrices * sizeof(double *));
for (int i = 0; i < Nmatrices; i++) h_inout_pointers[i] = d_A + i * N * N;

double **d_inout_pointers;
gpuErrchk(cudaMalloc(&d_inout_pointers, Nmatrices * sizeof(double *)));
gpuErrchk(cudaMemcpy(d_inout_pointers, h_inout_pointers, Nmatrices * sizeof(double *), cudaMemcpyHostToDevice));
free(h_inout_pointers);

int *d_pivotArray; gpuErrchk(cudaMalloc(&d_pivotArray, N * Nmatrices * sizeof(int)));
int *d_InfoArray; gpuErrchk(cudaMalloc(&d_InfoArray, Nmatrices * sizeof(int)));

int *h_InfoArray = (int *)malloc(Nmatrices * sizeof(int));

cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, d_pivotArray, d_InfoArray, Nmatrices));
//cublasSafeCall(cublasDgetrfBatched(cublas_handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}

timingLU = timerLU.GetCounter();
printf("Timing LU decomposition %f [ms]\n", timingLU);

/*********************************/
/* CHECKING THE LU DECOMPOSITION */
/*********************************/
saveCPUrealtxt(h_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\A.txt", N * N);
saveCPUrealtxt(h_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\y.txt", N);
saveGPUrealtxt(d_A, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\Adecomposed.txt", N * N);
saveGPUrealtxt(d_pivotArray, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\pivotArray.txt", N);

/******************************************************************************/
/* APPROACH NR.1: COMPUTE THE INVERSE OF A STARTING FROM ITS LU DECOMPOSITION */
/******************************************************************************/
timerApproach1.StartCounter();

// --- Allocate device space for the inverted matrices
double *d_Ainv; gpuErrchk(cudaMalloc(&d_Ainv, N * N * Nmatrices * sizeof(double)));

// --- Creating the array of pointers needed as output to the batched getri
double **h_out_pointers = (double **)malloc(Nmatrices * sizeof(double *));
for (int i = 0; i < Nmatrices; i++) h_out_pointers[i] = (double *)((char*)d_Ainv + i * ((size_t)N * N) * sizeof(double));

double **d_out_pointers;
gpuErrchk(cudaMalloc(&d_out_pointers, Nmatrices*sizeof(double *)));
gpuErrchk(cudaMemcpy(d_out_pointers, h_out_pointers, Nmatrices*sizeof(double *), cudaMemcpyHostToDevice));
free(h_out_pointers);

cublasSafeCall(cublasDgetriBatched(cublas_handle, N, (const double **)d_inout_pointers, N, d_pivotArray, d_out_pointers, N, d_InfoArray, Nmatrices));

gpuErrchk(cudaMemcpy(h_InfoArray, d_InfoArray, Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Inversion of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}

double alpha1 = 1.f;
double beta1 = 0.f;

cublasSafeCall(cublasDgemv(cublas_handle, CUBLAS_OP_N, N, N, &alpha1, d_Ainv, N, d_y, 1, &beta1, d_x, 1));

timingApproach1 = timingLU + timerApproach1.GetCounter();
printf("Timing approach 1 %f [ms]\n", timingApproach1);

/**************************/
/* CHECKING APPROACH NR.1 */
/**************************/
saveGPUrealtxt(d_x, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach1.txt", N);

/*************************************************************/
/* APPROACH NR.2: INVERT UPPER AND LOWER TRIANGULAR MATRICES */
/*************************************************************/
timerApproach2.StartCounter();

double *d_P; gpuErrchk(cudaMalloc(&d_P, N * N * sizeof(double)));

gpuErrchk(cudaMemcpy(h_y, d_y, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));
int *h_pivotArray = (int *)malloc(N * Nmatrices*sizeof(int));
gpuErrchk(cudaMemcpy(h_pivotArray, d_pivotArray, N * Nmatrices * sizeof(int), cudaMemcpyDeviceToHost));

rearrange(h_y, h_pivotArray, N);
gpuErrchk(cudaMemcpy(d_y, h_y, N * Nmatrices * sizeof(double), cudaMemcpyHostToDevice));

// --- Now P*A=L*U
// Linear system A*x=y => P.'*L*U*x=y => L*U*x=P*y

// --- 1st phase - solve Ly = b
const double alpha = 1.f;

// --- Function solves the triangular linear system with multiple right hand sides, function overrides b as a result

// --- Lower triangular part
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, d_A, N, d_y, N));

// --- Upper triangular part
cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, d_A, N, d_y, N));

timingApproach2 = timingLU + timerApproach2.GetCounter();
printf("Timing approach 2 %f [ms]\n", timingApproach2);

/**************************/
/* CHECKING APPROACH NR.2 */
/**************************/
saveGPUrealtxt(d_y, "D:\\Project\\solveSquareLinearSystemCUDA\\solveSquareLinearSystemCUDA\\xApproach2.txt", N);

return 0;
}

运行此示例所需的 Utilities.cuUtilities.cuh文件在此 github page上维护。 TimingGPU.cuTimingGPU.cuh文件在此 github page中维护。

关于第三种方法的一些有用的引用资料:

NAG Fortran Library Routine Document

Scientific Computing Software Library (SCSL) User’s Guide

https://www.cs.drexel.edu/~jjohnson/2010-11/summer/cs680/programs/lapack/Danh/verify_sequential.c

编辑

进近nr的时间(以毫秒为单位)。 2和3(在GTX960卡上执行的测试,抄送5.2)。
N        LU decomposition       Approach nr. 2       Approach nr. 3
100 1.08 2.75 1.28
500 45.4 161 45.7
1000 302 1053 303

出现时,请接近nr。 3更为方便,其成本本质上是计算LU分解的成本。此外:
  • 通过LU分解求解线性系统比使用QR分解更快(请参阅QR decomposition to solve linear systems in CUDA);
  • LU分解仅限于平方线性系统,而QR分解在非平方线性系统的情况下有帮助。

  • 下面的Matlab代码可用于检查结果
    clear all
    close all
    clc

    warning off

    N = 1000;

    % --- Setting the problem solution
    x = ones(N, 1);

    %%%%%%%%%%%%%%%%%%%%%
    % NxN HANKEL MATRIX %
    %%%%%%%%%%%%%%%%%%%%%
    % --- https://en.wikipedia.org/wiki/Hankel_matrix
    load A.txt
    load y.txt

    A = reshape(A, N, N);

    yMatlab = A * x;
    fprintf('Percentage rms between coefficients vectors in Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(yMatlab - y).^2)) / sum(sum(abs(yMatlab).^2))));

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    % COMPUTATION OF THE LU DECOMPOSITION %
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    [Lmatlab, Umatlab] = lu(A);

    load Adecomposed.txt
    Adecomposed = reshape(Adecomposed, N, N);

    L = eye(N);
    for k = 1 : N
    L(k + 1 : N, k) = Adecomposed(k + 1 : N, k);
    end
    U = zeros(N);
    for k = 1 : N
    U(k, k : N) = Adecomposed(k, k : N);
    end

    load pivotArray.txt
    Pj = eye(N);
    for j = 1 : N
    tempVector = Pj(j, :);
    Pj(j, :) = Pj(pivotArray(j), :);
    Pj(pivotArray(j), :) = tempVector;
    end

    fprintf('Percentage rms between Pj * A and L * U in CUDA %f\n', 100 * sqrt(sum(sum(abs(Pj * A - L * U).^2)) / sum(sum(abs(Pj * A).^2))));

    xprime = inv(Lmatlab) * yMatlab;
    xMatlab = inv(Umatlab) * xprime;

    fprintf('Percentage rms between reference solution and solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

    load xApproach1.txt
    fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.1 %f\n', 100 * sqrt(sum(sum(abs(xApproach1 - x).^2)) / sum(sum(abs(x).^2))));

    load xApproach2.txt
    fprintf('Percentage rms between reference solution and solution in CUDA for approach nr.2 %f\n', 100 * sqrt(sum(sum(abs(xApproach2 - x).^2)) / sum(sum(abs(x).^2))));

    关于cuda - 用CUDA求解线性系统AX = B,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28794010/

    25 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com