cuda - 使用 cuSPARSE 将密集矩阵转换为稀疏 CSR 格式-6ren

cuda - 使用 cuSPARSE 将密集矩阵转换为稀疏 CSR 格式

转载作者：行者123 更新时间：2023-12-02 21:26:18

我想使用 Scsrmv cusparse 函数。

阅读来自 here 的文档，我不知道如何定义 csrRowPtrA 和 csrColIndA:

csrRowPtrA : integer array of m+1 elements that contains the start of every row and the end of the last row plus one.

csrColIndA : integer array of nnz ( = csrRowPtrA(m) - csrRowPtrA(0) ) column indices of the nonzero elements of matrix A.

所以，例如:

  float *devRow;
  cudaMalloc((void **)&devRow, (m+1)*sizeof(float));

如果 A 是矩阵，则:

for (int i=0; i<m; i+= n)   //m is rows , n is columns
    devRow[i] = A[i];

这是每一行的开头。对于最后一行加 1？这让我很困惑。

对于专栏？像这样的东西:

for (int i=0;i<nnz;i++)
   devCol = devRow[m] - devRow[0];

最佳答案

根据 Robert Crovella 的回答，这是一个完整的示例，说明如何将以密集格式存储的稀疏矩阵转换为CSR(压缩行存储)格式。我希望它对其他用户有用。

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cusparse_v2.h>

#include "Utilities.cuh"

/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    switch (error)
    {

        case CUSPARSE_STATUS_SUCCESS:
            return "CUSPARSE_STATUS_SUCCESS";

        case CUSPARSE_STATUS_NOT_INITIALIZED:
            return "CUSPARSE_STATUS_NOT_INITIALIZED";

        case CUSPARSE_STATUS_ALLOC_FAILED:
            return "CUSPARSE_STATUS_ALLOC_FAILED";

        case CUSPARSE_STATUS_INVALID_VALUE:
            return "CUSPARSE_STATUS_INVALID_VALUE";

        case CUSPARSE_STATUS_ARCH_MISMATCH:
            return "CUSPARSE_STATUS_ARCH_MISMATCH";

        case CUSPARSE_STATUS_MAPPING_ERROR:
            return "CUSPARSE_STATUS_MAPPING_ERROR";

        case CUSPARSE_STATUS_EXECUTION_FAILED:
            return "CUSPARSE_STATUS_EXECUTION_FAILED";

        case CUSPARSE_STATUS_INTERNAL_ERROR:
            return "CUSPARSE_STATUS_INTERNAL_ERROR";

        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";

        case CUSPARSE_STATUS_ZERO_PIVOT:
            return "CUSPARSE_STATUS_ZERO_PIVOT";
    }

    return "<unknown>";
}

inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if(CUSPARSE_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUSPARSE error in file '%s', line %Ndims\Nobjs %s\nerror %Ndims: %s\nterminating!\Nobjs",__FILE__, __LINE__,err, \
                                _cusparseGetErrorEnum(err)); \
        cudaDeviceReset(); assert(0); \
    }
}

extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }

/********/
/* MAIN */
/********/
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle;    cusparseSafeCall(cusparseCreate(&handle));

    const int Nrows = 4;                        // --- Number of rows
    const int Ncols = 5;                        // --- Number of columns

    // --- Host side dense matrix
    double *h_A_dense = (double*)malloc(Nrows*Ncols*sizeof(*h_A_dense));

    // --- Column-major ordering
    h_A_dense[0] = 1.0f; h_A_dense[4] = 4.0f; h_A_dense[8]  = 0.0f; h_A_dense[12] = 0.0f; h_A_dense[16] = 0.0f;
    h_A_dense[1] = 0.0f; h_A_dense[5] = 2.0f; h_A_dense[9]  = 3.0f; h_A_dense[13] = 0.0f; h_A_dense[17] = 0.0f;
    h_A_dense[2] = 5.0f; h_A_dense[6] = 0.0f; h_A_dense[10] = 0.0f; h_A_dense[14] = 7.0f; h_A_dense[18] = 8.0f;
    h_A_dense[3] = 0.0f; h_A_dense[7] = 0.0f; h_A_dense[11] = 9.0f; h_A_dense[15] = 0.0f; h_A_dense[19] = 6.0f;

    //create device array and copy host to it
    double *d_A_dense;  gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(*d_A_dense)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA;      cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSetMatType      (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase (descrA, CUSPARSE_INDEX_BASE_ZERO);  

    int nnz = 0;                                // --- Number of nonzero elements in dense matrix
    const int lda = Nrows;                      // --- Leading dimension of dense matrix
    // --- Device side number of nonzero elements per row
    int *d_nnzPerVector;    gpuErrchk(cudaMalloc(&d_nnzPerVector, Nrows * sizeof(*d_nnzPerVector)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, &nnz));
    // --- Host side number of nonzero elements per row
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(*h_nnzPerVector));
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(*h_nnzPerVector), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz);
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]);
    printf("\n");

    // --- Device side dense matrix
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, nnz * sizeof(*d_A)));
    int *d_A_RowIndices;    gpuErrchk(cudaMalloc(&d_A_RowIndices, (Nrows + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices;    gpuErrchk(cudaMalloc(&d_A_ColIndices, nnz * sizeof(*d_A_ColIndices)));

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side dense matrix
    double *h_A = (double *)malloc(nnz * sizeof(*h_A));     
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz*sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    for (int i = 0; i < nnz; ++i) printf("A[%i] = %.0f ", i, h_A[i]); printf("\n");

    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n");

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);   

}

关于cuda - 使用 cuSPARSE 将密集矩阵转换为稀疏 CSR 格式，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/23993014/

文章推荐： zeromq - 0mq/NetMQ PUB/SUB 未传送所有消息

文章推荐： java - 禁用库的日志输出

文章推荐： asp.net-mvc - Html.Dropdownlist 所选值不起作用

python - “密集”对象没有属性 'op'
关闭。这个问题是not reproducible or was caused by typos .它目前不接受答案。想改善这个问题吗？更新问题，使其成为 on-topic对于堆栈溢出。 5 个月前关
opencv - 如何在蒙版图像上进行基于网格(密集)的光流？
我正在尝试使用摄像机跟踪多个人。我不想使用 blob 分割技术。我想做什么: 执行背景减法以获得隔离人们运动的掩码。在这些区域执行基于网格的光流 -我最好的选择是什么？我正在努力实现。我已经尝试过
python - OpenCV-Python 密集 SIFT
OpenCV 有 very good documentation on generating SIFT descriptors ，但这是“弱 SIFT”的一个版本，其中关键点由原始 Lowe algo
c++ - 索引到 CHOLMOD 密集 vector 数组
我有一个 cholmod_dense 数据结构: cholmod_dense* ex = cholmod_l_solve(CHOLMOD_A, L, B, &com); 我想提取这些值并将它们复制到另
python - OpenCV-Python 密集 SIFT 设置
这是先前发布的关于在 python 中使用 OpenCVs 密集筛选实现的问题的后续问题 (OpenCV-Python dense SIFT)。使用建议的代码进行密集筛选 dense=cv2
computer-vision - 密集 SIFT 和 HoG 有什么区别？
我是计算机视觉的新手。我正在学习 Dense SIFT 和 HOG。对于密集 SIFT，算法只是将每个点视为一个有趣的点并计算其梯度向量。 HOG 是另一种用梯度向量描述图像的方法。我认为 Dens
python - 具有对比度阈值的 Opencv 2.4 密集 SIFT
我正在尝试使用 openCV-python 2.4 计算密集 SIFT import cv2 def gen_sift_features(gray, step_size, gamma): de
opencv - 对于词袋，密集 SIFT 是否比 SIFT 更好？
我正在使用 OpenCV 实现词袋图像分类器。最初我测试了在 SURF 关键点中提取的 SURF 描述符。我听说 Dense SIFT(或 PHOW)描述符更适合我的目的，所以我也尝试了它们。令我惊
c++ - 使用 boost 求解(密集)线性系统 Ax=b
我有一个密集的 Ax=b 类型的方程组要在我的 C++ 程序中求解，我希望在 boost 中使用 UBLAS 来实现该解决方案。在其他一些问题中，我发现人们正在使用扩展 LAPACK，但不幸的是，它似
c++ - 什么是 boost 密集 C++/模板编译的良好 CPU/PC 设置？
我目前有一台配备 Opteron 275 (2.2Ghz)(双核 CPU)和 4GB RAM 以及速度非常快的硬盘的机器。我发现即使是使用 C++ 模板(想想 boost 等)编译一些简单的项目时，我

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

cuda - 使用 cuSPARSE 将密集矩阵转换为稀疏 CSR 格式