
c++ - CUDA-CUBLAS : issues solving many (3x3) dense linear systems


I am trying to solve about 1,200,000 linear systems (3x3, Ax=b) with CUDA 10.1, in particular using the cuBLAS library. I took a cue from this (useful!) post and rewrote the suggested code in a unified-memory version. The algorithm first performs an LU factorization with cublasDgetrfBatched(), then solves the lower- and upper-triangular linear systems with two consecutive calls to cublasDtrsmBatched(). The code is attached below. It works correctly for up to about 10,000 matrices; in that case the LU factorization takes about 570 ms (on an NVIDIA GeForce 930MX) and solving the systems takes about 311 ms.
My questions/issues are:

  • Overload issue: it crashes when allocating memory for more than 10k matrices. Why? How can I improve my code in order to solve the whole batch of 1.2 million matrices?
  • Time issue: my goal would be to solve all of these systems in less than 1 second. Am I currently following the correct approach? Any suggestions otherwise?
  • Would it be possible and/or useful, and if yes how, to use 'streams' of batches of 10k matrices?

  • Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <assert.h>
    #include <algorithm>
    #include <cmath>
    #include <iostream>
    #include <vector>
    #include <ctime>
    #include <ratio>
    #include <chrono>
    #include <random>
    #include <time.h>
    #include <math.h>

    // CUDA
    #include <cuda.h>
    #include <cuda_runtime.h>
    #include "device_launch_parameters.h"
    #include <cusolverDn.h>

    //#include "Utilities.cuh"

    using namespace std;
    using namespace std::chrono;

    /************************************/
    /* COEFFICIENT REARRANGING FUNCTION */
    /************************************/
    void rearrange(double** vec, int* pivotArray, int N, int numMatrices) {
        for (int nm = 0; nm < numMatrices; nm++) {
            for (int i = 0; i < N; i++) {
                double temp = vec[nm][i];
                vec[nm][i] = vec[nm][pivotArray[N * i + nm] - 1];
                vec[nm][pivotArray[N * i + nm] - 1] = temp;
            }
        }
    }


    /************************************/
    /* MAIN */
    /************************************/
    int main() {

        const int N = 3;
        const int numMatrices = 10000; // I want 1200000

        // random generator to fill matrices and coefficients
        random_device device;
        mt19937 generator(device());
        uniform_real_distribution<double> distribution(1., 5.);

        // ALLOCATE MEMORY - using unified memory
        double** h_A;
        cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
        for (int nm = 0; nm < numMatrices; nm++) {
            cudaMallocManaged(&(h_A[nm]), sizeof(double) * N * N);
        }

        double** h_b;
        cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
        for (int nm = 0; nm < numMatrices; nm++) {
            cudaMallocManaged(&(h_b[nm]), sizeof(double) * N);
        }
        cout << " memory allocated" << endl;

        // FILL MATRICES
        for (int nm = 0; nm < numMatrices; nm++) {
            for (int i = 0; i < N; i++) {
                for (int j = 0; j < N; j++) {
                    h_A[nm][j * N + i] = distribution(generator);
                }
            }
        }
        cout << " Matrix filled " << endl;

        // FILL COEFFICIENTS
        for (int nm = 0; nm < numMatrices; nm++) {
            for (int i = 0; i < N; i++) {
                h_b[nm][i] = distribution(generator);
            }
        }
        cout << " Coeff. vector filled " << endl;
        cout << endl;

        // --- CUDA solver initialization
        cublasHandle_t cublas_handle;
        cublasCreate_v2(&cublas_handle);
        int* PivotArray;
        cudaMallocManaged(&PivotArray, N * numMatrices * sizeof(int));
        int* infoArray;
        cudaMallocManaged(&infoArray, numMatrices * sizeof(int));

        // CUBLAS LU SOLVER
        high_resolution_clock::time_point t1 = high_resolution_clock::now();
        cublasDgetrfBatched(cublas_handle, N, h_A, N, PivotArray, infoArray, numMatrices);
        cudaDeviceSynchronize();
        high_resolution_clock::time_point t2 = high_resolution_clock::now();
        duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
        cout << "It took " << time_span.count() * 1000. << " milliseconds." << endl;

        for (int i = 0; i < numMatrices; i++)
            if (infoArray[i] != 0) {
                fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            }

        // rearrange coefficient
        // (temporarily on CPU, this step will be on a GPU Kernel as well)
        high_resolution_clock::time_point tA = high_resolution_clock::now();
        rearrange(h_b, PivotArray, N, numMatrices);
        high_resolution_clock::time_point tB = high_resolution_clock::now();
        duration<double> time_spanA = duration_cast<duration<double>>(tB - tA);
        cout << "rearrangement took " << time_spanA.count() * 1000. << " milliseconds." << endl;

        // INVERT UPPER AND LOWER TRIANGULAR MATRICES
        // --- Function solves the triangular linear system with multiple right-hand sides
        // --- Function overrides b as a result
        const double alpha = 1.f;
        high_resolution_clock::time_point t3 = high_resolution_clock::now();
        cublasDtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT, N, 1, &alpha, h_A, N, h_b, N, numMatrices);
        cublasDtrsmBatched(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, N, 1, &alpha, h_A, N, h_b, N, numMatrices);
        cudaDeviceSynchronize();
        high_resolution_clock::time_point t4 = high_resolution_clock::now();
        duration<double> time_span2 = duration_cast<duration<double>>(t4 - t3);
        cout << "second step took " << time_span2.count() * 1000. << " milliseconds." << endl;

        // --- Free resources
        if (h_A) cudaFree(h_A);
        if (h_b) cudaFree(h_b);

        cudaDeviceReset();

        return 0;
    }

    Best Answer

    Overload issue: it crashes allocating memory for more than 10k matrices. Why? How can I improve my code in order to solve the whole batch of 1.2 million matrices?


    In my view, the biggest problem in your code is that you are making exceptionally inefficient use of managed memory in these key allocation loops:
    // ALLOCATE MEMORY - using unified memory
    double** h_A;
    cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
    for (int nm = 0; nm < numMatrices; nm++) {
        cudaMallocManaged(&(h_A[nm]), sizeof(double) * N * N);
    }

    double** h_b;
    cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
    for (int nm = 0; nm < numMatrices; nm++) {
        cudaMallocManaged(&(h_b[nm]), sizeof(double) * N);
    }
    The problem is that each call to cudaMallocManaged has a minimum granularity. That means that if you request the allocation of 1 byte, it will actually use up something like 4 kbytes of memory (I believe that is the Linux allocation granularity; it appears you are on Windows, and I believe the Windows allocation granularity may be larger). In addition, this creates a hugely inefficient data-transfer load on the managed memory subsystem when you launch a kernel (and kernels will be launched in your cublas calls).
    A much better way to do this is to perform a single large allocation instead of the allocation-in-a-loop, and then just subdivide that allocation using pointer arithmetic. The code could look like this:
    // ALLOCATE MEMORY - using unified memory
    double** h_A;
    cudaMallocManaged(&h_A, sizeof(double*) * numMatrices);
    cudaMallocManaged(&(h_A[0]), sizeof(double) * numMatrices * N * N);
    for (int nm = 1; nm < numMatrices; nm++) {
        h_A[nm] = h_A[nm - 1] + N * N;
    }

    double** h_b;
    cudaMallocManaged(&h_b, sizeof(double*) * numMatrices);
    cudaMallocManaged(&(h_b[0]), sizeof(double) * numMatrices * N);
    for (int nm = 1; nm < numMatrices; nm++) {
        h_b[nm] = h_b[nm - 1] + N;
    }
    A side benefit of doing it this way is that the allocation process runs considerably faster.
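    A small follow-up (not part of the original answer): with the single large allocation, the cleanup at the end of the program changes as well. Freeing only the pointer arrays, as the posted code does, leaves the data blocks to be reclaimed by cudaDeviceReset(); with this layout they can be released explicitly, for example:
    // --- Free resources (matching the single-allocation layout above)
    if (h_A) { cudaFree(h_A[0]); cudaFree(h_A); } // data block first, then the pointer array
    if (h_b) { cudaFree(h_b[0]); cudaFree(h_b); }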

    Time issue: my goal would be to solve all of these systems in less than 1 second. Am I currently following the correct approach? Any suggestions otherwise?


    With those changes to the code, I was able to run it successfully on a 1 GB GPU (GeForce GT640) with:
    const int numMatrices = 1200000;
    with output like this:
    $ ./t81
    memory allocated
    Matrix filled
    Coeff. vector filled

    It took 70.3032 milliseconds.
    rearrangement took 60.02 milliseconds.
    second step took 156.067 milliseconds.
    Your GPU may be somewhat slower, but I think the overall timing should easily come in at well under 1 second.

    Would it be possible and/or useful, and if yes how, to use 'streams' of batches of 10k matrices?


    With the changes suggested above, I don't think you need to worry about this. Streams won't help here with overlapping compute operations. They can help with copy/compute overlap (though perhaps not much on your GPU), but this is hard to architect on Windows with managed memory. For Windows usage I would probably suggest switching to ordinary CUDA separation of host and device memory if you want to explore copy/compute overlap; a minimal sketch of that pattern follows.
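    The sketch below is illustrative only (it is not code from the answer). It assumes ordinary pinned host and device allocations, a hypothetical contiguous host array hA holding all matrices, a chunk size of 10k, and two streams alternating over two device buffers; only the cublasDgetrfBatched step is shown.
    // Sketch only: chunked copy/compute overlap with pinned host memory and two streams.
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <vector>

    int main() {
        const int N = 3;
        const int numMatrices = 1200000;
        const int CHUNK = 10000;                      // matrices per batched submission
        const int nChunks = numMatrices / CHUNK;      // assume it divides evenly

        // pinned host storage for all matrices (required for truly asynchronous copies)
        double* hA;
        cudaMallocHost((void**)&hA, sizeof(double) * (size_t)numMatrices * N * N);
        // ... fill hA with the 3x3 matrices, column-major per matrix ...

        cublasHandle_t handle;
        cublasCreate(&handle);

        cudaStream_t stream[2];
        double* dA[2];      // per-stream device buffer holding CHUNK matrices
        double** dAptr[2];  // per-stream device array of pointers into dA
        int *dPiv[2], *dInfo[2];
        for (int s = 0; s < 2; s++) {
            cudaStreamCreate(&stream[s]);
            cudaMalloc((void**)&dA[s], sizeof(double) * CHUNK * N * N);
            cudaMalloc((void**)&dAptr[s], sizeof(double*) * CHUNK);
            cudaMalloc((void**)&dPiv[s], sizeof(int) * CHUNK * N);
            cudaMalloc((void**)&dInfo[s], sizeof(int) * CHUNK);
            // the pointer array never changes, so build it once on the host and copy it over
            std::vector<double*> ptrs(CHUNK);
            for (int i = 0; i < CHUNK; i++) ptrs[i] = dA[s] + (size_t)i * N * N;
            cudaMemcpy(dAptr[s], ptrs.data(), sizeof(double*) * CHUNK, cudaMemcpyHostToDevice);
        }

        for (int c = 0; c < nChunks; c++) {
            int s = c % 2;                            // alternate between the two streams/buffers
            cudaMemcpyAsync(dA[s], hA + (size_t)c * CHUNK * N * N,
                            sizeof(double) * CHUNK * N * N,
                            cudaMemcpyHostToDevice, stream[s]);
            cublasSetStream(handle, stream[s]);
            cublasDgetrfBatched(handle, N, dAptr[s], N, dPiv[s], dInfo[s], CHUNK);
            // ... the triangular solves and the copy of results back to the host
            //     would be enqueued on the same stream here ...
        }
        cudaDeviceSynchronize();

        cublasDestroy(handle);
        for (int s = 0; s < 2; s++) {
            cudaStreamDestroy(stream[s]);
            cudaFree(dA[s]); cudaFree(dAptr[s]); cudaFree(dPiv[s]); cudaFree(dInfo[s]);
        }
        cudaFreeHost(hA);
        return 0;
    }
    Because operations issued to the same stream execute in order, the copy for chunk c+2 cannot overwrite a buffer that chunk c is still using, while the copy on one stream can overlap the factorization running on the other.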
    As an aside, you may be able to get a set of cublas calls that do the work faster by using direct inversion. cuBLAS has a batched direct-inversion method. I normally wouldn't suggest this for the solution of linear systems, but it may be something to consider for a set of 3x3 or 4x4 inversions, where singularity can easily be checked with the determinant method. Here is an example.
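    For illustration only (this is not the code the answer links to), a rough sketch of that direct-inversion route, assuming the same managed pointer-array layout as in the code above, could look like the following; h_Ainv, h_x, and the apply_inverse kernel are hypothetical names introduced here, with h_Ainv and h_x allocated exactly like h_A and h_b.
    // Sketch only: invert every 3x3 matrix in one batched call, then form x = A^-1 * b.
    __global__ void apply_inverse(double** Ainv, double** b, double** x,
                                  int N, int numMatrices) {
        int nm = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per system
        if (nm >= numMatrices) return;
        for (int i = 0; i < N; i++) {
            double s = 0.0;
            for (int j = 0; j < N; j++)
                s += Ainv[nm][j * N + i] * b[nm][j];     // column-major, as in the question's fill loop
            x[nm][i] = s;
        }
    }

    // ... inside main(), after h_A and h_b have been filled:
    // invert all batch matrices at once (matinvBatched is intended for small n, here n = 3);
    // infoArray reports, per matrix, whether the inversion failed (singular matrix)
    cublasDmatinvBatched(cublas_handle, N, h_A, N, h_Ainv, N, infoArray, numMatrices);
    apply_inverse<<<(numMatrices + 255) / 256, 256>>>(h_Ainv, h_b, h_x, N, numMatrices);
    cudaDeviceSynchronize();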

    Regarding "c++ - CUDA-CUBLAS: issues solving many (3x3) dense linear systems", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/64664431/
