cuda - 使用 cuSPARSE 在 CUDA 中进行稀疏矩阵矩阵乘法-6ren

cuda - 使用 cuSPARSE 在 CUDA 中进行稀疏矩阵矩阵乘法

转载作者：行者123 更新时间：2023-11-30 23:53:22

我正在使用 cuSPARSE 库在 Nvidia K40 上对稀疏矩阵-矩阵乘法进行基准测试。我正在创建自己的 CSR 格式的稀疏矩阵，并且我正在使用 cuSPARSE 库的 cusparseXcsrgemmNnz 例程。但是，随着我增加数据大小，调用 cusparseXcsrgemmNnz 时发生错误，即未返回 CUSPARSE_STATUS_SUCCESS。此外，cudaMemcpy 失败并且未返回 CUDA_SUCCESS。该代码适用于 8 x 8 和 16 x 16 矩阵。但是，从 32 x 32 开始，会观察到此错误。

编辑:我从 cusparseXcsrgemmNnz 收到第三个矩阵大小的 CUSPARSE_STATUS_EXECUTION_FAILED。对于前两个矩阵大小，执行是正确的。

#include <cusparse_v2.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>

// error check macros
#define CUSPARSE_CHECK(x) {cusparseStatus_t _c=x; if (_c != CUSPARSE_STATUS_SUCCESS) {printf("cusparse fail: %d, line: %d\n", (int)_c, __LINE__); exit(-1);}}

#define cudaCheckErrors(msg) \
do { \
    cudaError_t __err = cudaGetLastError(); \
    if (__err != cudaSuccess) { \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
            msg, cudaGetErrorString(__err), \
            __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } \
} while (0)


double timerval()
{
    struct timeval st;
    gettimeofday(&st, NULL);
    return (st.tv_sec+st.tv_usec*1e-6);
}

// perform sparse-matrix multiplication C=AxB
int main(){ 
double avg_time = 0, s_time, e_time;

cusparseStatus_t stat;
cusparseHandle_t hndl;
cusparseMatDescr_t descrA, descrB, descrC;
int *csrRowPtrA, *csrRowPtrB, *csrRowPtrC, *csrColIndA, *csrColIndB, *csrColIndC;
int *h_csrRowPtrA, *h_csrRowPtrB, *h_csrRowPtrC, *h_csrColIndA, *h_csrColIndB, *h_csrColIndC,*pos;
float *csrValA, *csrValB, *csrValC, *h_csrValA, *h_csrValB, *h_csrValC;
int nnzA, nnzB, nnzC;
int m=4,n,k,loop;
int i,j;
int iterations;
for (iterations=0;iterations<10;iterations++)
{
    m *=2;
    n = m;
    k = m;
    //density of the sparse matrix to be created. Assume 5% density.
    double dense_const = 0.05;
    int temp5, temp6,temp3,temp4;
    int density=(m*n)*(dense_const);
    nnzA = density;
    nnzB = density;
    h_csrRowPtrA = (int *)malloc((m+1)*sizeof(int));
    h_csrRowPtrB = (int *)malloc((n+1)*sizeof(int));
    h_csrColIndA = (int *)malloc(density*sizeof(int));
    h_csrColIndB = (int *)malloc(density*sizeof(int));
    h_csrValA  = (float *)malloc(density*sizeof(float));
    h_csrValB  = (float *)malloc(density*sizeof(float));
    if ((h_csrRowPtrA == NULL) || (h_csrRowPtrB == NULL) || (h_csrColIndA == NULL) || (h_csrColIndB == NULL) || (h_csrValA == NULL) || (h_csrValB == NULL))
    {printf("malloc fail\n"); return -1;}

    //position array for random initialisation of positions in input matrix
    pos= (int *)calloc((m*n), sizeof(int));
    int temp,temp1;

    //  printf("the density is %d\n",density);
    //  printf("check 1:\n");

    //randomly initialise positions
    for(i=0;i<density;i++)
    {
        temp1=rand()%(m*n);
        pos[i]=temp1;

    }
    //  printf("check 2:\n");

    //sort the 'pos' array
    for (i = 0 ; i < density; i++) {
        int d = i;
        int t;

        while ( d > 0 && pos[d] < pos[d-1]) {
            t          = pos[d];
            pos[d]   = pos[d-1];
            pos[d-1] = t;        
            d--;
        }
    }
    // initialise with non zero elements and extract column and row ptr vector
    j=1;
    //ja[0]=1;

    int p=0;
    int f=0;

    for(i = 0; i < density; i++)
    {
        temp=pos[i];
         h_csrValA[f] = rand();
         h_csrValB[f] = rand();
         h_csrColIndA[f] = temp%m;
         h_csrColIndB[f] = temp%m;
        f++;
        p++;
        temp5= pos[i];
        temp6=pos[i+1];
        temp3=temp5-(temp5%m);
        temp4=temp6-(temp6%m);

        if(!(temp3== temp4))
        {   
            if((temp3+m==temp6))
            {}
            else    
            {   
                h_csrRowPtrA[j]=p;
                h_csrRowPtrB[j]=p;
                j++;
            }

        }       
    }

    // transfer data to device

    cudaMalloc(&csrRowPtrA, (m+1)*sizeof(int));
    cudaMalloc(&csrRowPtrB, (n+1)*sizeof(int));
    cudaMalloc(&csrColIndA, density*sizeof(int));
    cudaMalloc(&csrColIndB, density*sizeof(int));
    cudaMalloc(&csrValA, density*sizeof(float));
    cudaMalloc(&csrValB, density*sizeof(float));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemcpy(csrRowPtrA, h_csrRowPtrA, (m+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrRowPtrB, h_csrRowPtrB, (n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndA, h_csrColIndA, density*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrColIndB, h_csrColIndB, density*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(csrValA, h_csrValA, density*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(csrValB, h_csrValB, density*sizeof(float), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail");

    // set cusparse matrix types
    CUSPARSE_CHECK(cusparseCreate(&hndl));
    stat = cusparseCreateMatDescr(&descrA);
    CUSPARSE_CHECK(stat);
    stat = cusparseCreateMatDescr(&descrB);
    CUSPARSE_CHECK(stat);
    stat = cusparseCreateMatDescr(&descrC);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrB, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrB, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    stat = cusparseSetMatIndexBase(descrC, CUSPARSE_INDEX_BASE_ZERO);
    CUSPARSE_CHECK(stat);
    cusparseOperation_t transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;

    // figure out size of C
    int baseC;
    // nnzTotalDevHostPtr points to host memory
    int *nnzTotalDevHostPtr = &nnzC;
    stat = cusparseSetPointerMode(hndl, CUSPARSE_POINTER_MODE_HOST);
    CUSPARSE_CHECK(stat);
    cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(m+1));
    cudaCheckErrors("cudaMalloc fail");

    s_time=timerval();

    stat = cusparseXcsrgemmNnz(hndl, transA, transB, m, n, k,
    descrA, nnzA, csrRowPtrA, csrColIndA,
    descrB, nnzB, csrRowPtrB, csrColIndB,
    descrC, csrRowPtrC, nnzTotalDevHostPtr );
    CUSPARSE_CHECK(stat);
    if (NULL != nnzTotalDevHostPtr){
    nnzC = *nnzTotalDevHostPtr;}
    else{
    cudaMemcpy(&nnzC, csrRowPtrC+m, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&baseC, csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");
    nnzC -= baseC;}
    cudaMalloc((void**)&csrColIndC, sizeof(int)*nnzC);
    cudaMalloc((void**)&csrValC, sizeof(float)*nnzC);
    cudaCheckErrors("cudaMalloc fail");
    // perform multiplication C = A*B

    for(loop=0;loop<1000;loop++)
    {
        stat = cusparseScsrgemm(hndl, transA, transB, m, n, k,
        descrA, nnzA,
        csrValA, csrRowPtrA, csrColIndA,
        descrB, nnzB,
        csrValB, csrRowPtrB, csrColIndB,
        descrC,
        csrValC, csrRowPtrC, csrColIndC);
        CUSPARSE_CHECK(stat);
    }

    e_time=timerval();

    avg_time=avg_time/1000;
    // copy result (C) back to host
    h_csrRowPtrC = (int *)malloc((m+1)*sizeof(int));
    h_csrColIndC = (int *)malloc(nnzC *sizeof(int));
    h_csrValC  = (float *)malloc(nnzC *sizeof(float));
    if ((h_csrRowPtrC == NULL) || (h_csrColIndC == NULL) || (h_csrValC == NULL))
    {printf("malloc fail\n"); return -1;}
    cudaMemcpy(h_csrRowPtrC, csrRowPtrC, (m+1)*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrColIndC, csrColIndC,  nnzC*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_csrValC, csrValC, nnzC*sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

    printf ("\n Input size: %d x %d ,Time: %lf and density is %d \n", m,n, avg_time, density); 

    cudaFree(csrRowPtrC);
    cudaFree(csrColIndC);
    cudaFree(csrValC);

    cudaFree(csrRowPtrA);
    cudaFree(csrColIndA);
    cudaFree(csrValA);

    cudaFree(csrRowPtrB);
    cudaFree(csrColIndB);
    cudaFree(csrValB);

    free(h_csrRowPtrC);
    free(h_csrColIndC);
    free(h_csrValC);

    free(h_csrRowPtrA);
    free(h_csrColIndA);
    free(h_csrValA);

    free(h_csrRowPtrB);
    free(h_csrColIndB);
    free(h_csrValB);
}
return 0;

最佳答案

您似乎从 here 中提取了部分代码

如该帖子所示:

a failure in cusparseXcsrgemmNnz could indicate an underlying problem in CSR matrix formatting.

我很确定这就是问题所在。生成格式正确的 CSR matrix 的过程坏了。

为证明这一点，请在您发布的代码中指定注释之前添加以下代码:

printf("A RowPtrs: \n");
for (int i = 0; i < m+1; i++) printf("%d ", h_csrRowPtrA[i]);
printf("\nA ColInds: \n");
for (int i = 0; i < nnzA; i++) printf("%d ", h_csrColIndA[i]);
printf("\nB RowPtrs: \n");
for (int i = 0; i < n+1; i++) printf("%d ", h_csrRowPtrB[i]);
printf("\nB ColInds: \n");
for (int i = 0; i < nnzB; i++) printf("%d ", h_csrColIndB[i]);
printf("\n");

    // add the above code before this comment:
    // transfer data to device

当我这样做、重新编译并运行时，我得到如下所示的输出:

$ ./t730
A RowPtrs:
0 1 2 3 0 0 0 0 0
A ColInds:
6 7 1
B RowPtrs:
0 1 2 3 0 0 0 0 0
B ColInds:
6 7 1

 Input size: 8 x 8 ,Time: 0.000000 and density is 3
A RowPtrs:
0 1 2 3 4 5 6 8 9 12 959542853 1886614883 1702064737 1299346243 1918980205 1232301409 1766154068
A ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
B RowPtrs:
-1688500168 1 2 3 4 5 6 8 9 12 0 0 0 0 0 0 0
B ColInds:
11 6 4 12 11 10 2 13 3 2 8 11
cusparse fail: 6, line: 193

我们看到第一组 CSR 格式的 A 和 B 矩阵直到并包括以 Input size: 8 x 8 开头的行。 .. 似乎已完成且没有错误，但格式实际上已损坏。空行的行指针不从零开始(行指针不允许向后移动)，它们的行指针从最后填充的行开始(这样每行的非零元素数等于当前行指针减去前一行指针)，和行指针序列中的最后一个值指向矩阵中最后一个元素之后的一个元素(即 CSR 行指针数组中的最后一个值是 nnz，非零元素的数量)。

下一组A和B矩阵(对应16x16 pass)明显坏了。至少，A 和 B CSR 格式矩阵中的行指针明显超出范围。

您用于创建 CSR 矩阵的代码刚刚损坏。我建议您研究 CSR 矩阵并创建一个工具来验证您将以这种方式随机创建的任何矩阵。 CUSP具有矩阵验证功能，我相信您还可以使用其他 CSR 矩阵格式验证功能。

关于cuda - 使用 cuSPARSE 在 CUDA 中进行稀疏矩阵矩阵乘法，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/29688627/

文章推荐： php - 如何连接到 Wordpress 上的远程数据库

文章推荐： python - 使用 GAE Memcache API 缓存预渲染模板

文章推荐： javascript - 使用 AJAX 添加输入？

文章推荐： php - MySQLi:如何使用准备好的语句插入多个表

GIT - 稀疏 checkout 未按预期工作
我在服务器上 checkout 了一个 git 存储库。该存储库过去在根目录下包含所有相关文件，但我必须进行一些更改，现在我有两个文件夹，src 和 dist，我想跟踪这两个文件夹. 我遇到的问题是，
vulkan - 稀疏 VkDescriptorSetLayoutBinding 有哪些用例？
我很难弄清楚 VkDescriptorSetLayoutBinding::binding 的任何用例，这是结构: struct VkDescriptorSetLayoutBinding { u
python - 稀疏 python 向量的范数
Python中能否有效获取稀疏向量的范数？我尝试了以下方法: from scipy import sparse from numpy.linalg import norm vector1 = spa
python - Numpy 排序数组(稀疏)
我正在尝试找出为什么这段代码不对数组进行排序... 任意向量。 x = array([[3, 2, 4, 5, 7, 4, 3, 4, 3, 3, 1, 4, 6, 3, 2, 4, 3, 2]])
c - 稀疏 vector 的高效编码算法
有谁知道如何压缩(编码)稀疏 vector ？稀疏 vector 表示有许多“0”的 1xN 矩阵。例如 10000000000001110000000000000000100000000 上面是稀
matlab - 稀疏 GP 回归的初始种子
我使用稀疏高斯过程进行 Rasmussen 回归。[http://www.tsc.uc3m.es/~miguel/downloads.php][1] 预测平均值的语法是: [~, mu_1, ~, ~
java - 稀疏 vector ，它们是什么？
我在朴素贝叶斯分类器中使用 Mahout API。其中一个功能是 SparseVectorsFromSequenceFiles虽然我已经尝试过旧的谷歌搜索，但我仍然不明白什么是稀疏 vector 。最
c# - 在JavaScript稀疏数组和C#(稀疏)数组之间映射的推荐方法？
我正在尝试将JavaScript稀疏数组映射到C＃表示形式。建议这样做的方法是什么？它正在考虑使用一个字典，该字典包含在原始数组中包含值的原始词列表。还有其他想法吗？谢谢！最佳答案注意针
matlab - 解决*稀疏*上三角系统
如果我想求解一个完整上三角系统，我可以调用linsolve(A,b,'UT')。然而，这目前不支持稀疏矩阵。我该如何克服这个问题？最佳答案 UT 和 LT 系统是最容易解决的系统之一。读一读on t
python - Pandas :在没有行的地方添加零值(稀疏)
我有一个带有 MultiIndex 的 Pandas DataFrame。 MultiIndex 的值在 (0,0) 到 (1000,1000) 范围内，该列有两个字段 p 和 q. 但是，DataF
python - 稀疏(刚度)矩阵的高性能创建
我目前正在实现一个小型有限元模拟。使用 Python/Numpy，我正在寻找一种有效的方法来创建全局刚度矩阵: 1)我认为应该使用coo_matrix()从较小的单元刚度矩阵创建稀疏矩阵。但是，我可以
python - Numpy(稀疏)重复索引增量
a , b是 1D numpy ndarray与整数数据类型具有相同的大小。 C是一个 2D scipy.sparse.lil_matrix . 如果索引[a, b]包含重复索引，C[a, b] +=
algorithm - 找到(稀疏)图直径的好算法？
我有一个大的、连通的、稀疏的邻接表形式的图。我想找到两个尽可能远的顶点，即 diameter of the graph以及实现它的两个顶点。对于不同的应用程序，我对无向和有向情况下的这个问题都很感兴
c++ - 稀疏 x 密集矩阵乘法性能效率低下
上下文:我将 Eigen 用于人工神经网络，其中典型维度为每层约 1000 个节点。所以大部分操作是将大小为 ~(1000,1000) 的矩阵 M 与大小为 1000 的 vector 或一批 B v
Python保存一个(稀疏)矩阵，里面有一个变量
我有一些大小合适的矩阵 (2000*2000)，我希望在矩阵的元素中有符号表达式 - 即 .9**b + .8**b + .7**b ... 是一个元素的例子。矩阵非常稀疏。我通过添加中间计算来创建
c++ - 如何构造这个分块三对角(稀疏)矩阵？
在 R 或 C++ 中是否有一种快速填充(稀疏)矩阵的方法: A, B, 0, 0, 0 C, A, B, 0, 0 0, C, A, B, 0 0, 0, C, A, B 0, 0, 0, C, A
python - 稀疏 numpy 数组的局部敏感散列
我有一个大的稀疏 numpy/scipy 矩阵，其中每一行对应于高维空间中的一个点。我想进行以下类型的查询: 给定一个点P(矩阵中的一行)和一个距离epsilon，找到与epsilon距离最大的所有点
python - Scipy 稀疏 Cumsum
假设我有一个 scipy.sparse.csr_matrix 代表下面的值 [[0 0 1 2 0 3 0 4] [1 0 0 2 0 3 4 0]] 我想就地计算非零值的累积和，这会将数组更改为:
没有前导目录的 Git 稀疏 checkout
我了解如何在 Git 中配置稀疏 checkout ，但我想知道是否可以消除前导目录。例如，假设我有一个 Git 存储库，其文件夹结构如下: 文件夹1/foo 文件夹2/foo/bar/stuff 文
带有排除的 Git 稀疏 checkout
根据 this thread , Git 中的排除 sparse-checkout feature应该实现。是吗？假设我有以下结构: papers/ papers/... presentations

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

cuda - 使用 cuSPARSE 在 CUDA 中进行稀疏矩阵矩阵乘法