c++ - CUDA 中的并行批处理小矩阵不适用于 for 循环-6ren

c++ - CUDA 中的并行批处理小矩阵不适用于 for 循环

转载作者：行者123 更新时间：2023-12-03 06:53:34

我有一些(比如说一百万个) 4 x 3 的小矩阵。我想对它们做几个简单的操作，并且希望 CUDA 内核只在矩阵索引上并行(而不是在行/列操作上并行)。让我更详细地解释一下：我将矩阵数组 A[MatrixNumb][row][col] 作为输入传递给 GPU 内核，并希望并行化仅在 MatrixNumb 上进行(也就是说，我想强制操作只沿这一个维度并行)。为了简单起见，下面的示例仅使用 3 个矩阵。它可以编译并运行，但结果是错误的：基本上，它返回的矩阵与我作为输入提供的矩阵完全相同。我不明白原因，也不知道自己是否犯了什么错误；我应该如何重写/重新思考这段代码？我也用 cudaMallocManaged 编写过这段代码，以便在主机和设备之间共享内存，但改用经典的 cudaMalloc 加 memcpy 也得到相同的结果。

源.cpp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>

using namespace std;


// Kernel: each thread selects one matrix by its flat global thread index and
// processes that matrix with plain serial loops.
//   a           - indexed as a[matrix][row][col]; every visited element is
//                 overwritten with 3.14 * (element squared)
//   m, n        - upper bounds of the row loop (i) and the column loop (j)
//   numMatrices - received but never read in this body
//   w           - indexed as w[matrix][col]; receives 3.14 * a[matrix][1][col]^2,
//                 computed from row 1 after the first loop nest has updated it
// NOTE(review): idx is never compared against numMatrices, so a thread whose
// index is >= the number of matrices still dereferences a[idx] and w[idx].
__global__ void SVD(double*** a, const int m, const int n, const int numMatrices, double** w)
{
  // Flat 1-D global thread index, used directly as the matrix number.
  int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Intent stated by the author: each thread runs these loops independently.
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14;
    }
  }
  // Row index 1 is hard-coded here; the values read are the ones written above.
  for (int j = 0; j < n; j++) {
    w[idx][j] = 3.14 * a[idx][1][j]* a[idx][1][j];
  }

}


// Host driver: builds numMatrices small matrices filled with random values,
// prints them, launches the SVD kernel, then prints the matrices and the w
// vectors again.
// NOTE(review): no CUDA runtime call below has its return value checked, and
// no launch error is queried after the kernel call.
int main()
{
  const int n = 3;    // extent of the innermost (j) index
  const int m = 4;    // extent of the middle (i) index, via lda
  const int lda = m;
  const int numMatrices = 3;

  // Random source for the matrix entries: uniform doubles in [1, 5).
  random_device device;
  mt19937 generator(device());
  uniform_real_distribution<double> distribution(1., 5.);

  // create pointers
  // Both top-level pointer arrays are obtained from host `new`.
  double*** A = new double** [numMatrices];
  double** w = new double* [numMatrices];

  //ALLOCATE SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    A[nm] = new double* [lda];
    w[nm] = new double[n];

    for (int i = 0; i < lda; i++) {
      A[nm][i] = new double[n];

      for (int j = 0; j < n; j++) {
        // NOTE(review): the argument is the address of a single double element
        // cast to void**, not the address of a pointer variable; per the
        // cudaMallocManaged API the returned pointer is stored at that address.
        // The w[nm][j] call is repeated for every i. Confirm intent.
        cudaMallocManaged((void**)&A[nm][i][j], sizeof(double));
        cudaMallocManaged((void**)&w[nm][j], sizeof(double));
      }
    }
  }

  cout << " memory allocated" << endl;


  //FILL MATRICES INTO SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    // NOTE(review): A[nm], w[nm] and A[nm][i] are assigned fresh `new` arrays
    // here, replacing the pointers stored by the allocation loop above; the
    // earlier arrays are no longer reachable through A or w.
    A[nm] = new double* [lda];
    w[nm] = new double[n];                                   

    for (int i = 0; i < lda; i++) {
      A[nm][i] = new double[n];

      for (int j = 0; j < n; j++) {
        A[nm][i][j] = distribution(generator);
        w[nm][j] = 0.0;
      }
    }
  }
  cout << " matrix filled " << endl;


  // PRINT MATRICES BEFORE CUDA OPERATION
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  //KERNEL ----------------------------------------------------------------------
  int NThreads = 3;   
  // Integer division: 3 / 3 + 1 = 2 blocks, i.e. 6 threads for 3 matrices.
  int NBlocks = int(numMatrices / NThreads + 1);
 
  // NOTE(review): the arguments are passed as (A, n, m, ...) while the kernel
  // signature declares (a, m, n, ...). A and w hold pointers obtained from
  // `new` at this point.
  SVD << <NBlocks, NThreads >> > (A, n, m, numMatrices, w);
  cudaDeviceSynchronize();
  cout << " Kernel done " << endl << endl;

  // Print the matrices again after the kernel call.
  cout << " --- GPU --- " << endl;
  cout << " NEW MATRIX: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        cout << A[nm][i][j] << " ";
      }
      cout << endl;
    }
    cout << endl;
  }

  // Print one w vector (n values) per matrix.
  cout << " NEW VECTOR RESULTS: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < n; i++) {
      cout << w[nm][i] << " ";
    }
    cout << endl;
  }

  cout << endl;

  //FREE THE DEVICE'S MEMORY -----------------------------------------------------
  // NOTE(review): A and w were obtained from new[] yet are handed to cudaFree;
  // none of the nested arrays is released.
  cudaFree(A);
  cudaFree(w);
  cout << " Cuda free " << endl << endl;

  return 0;
}

我得到的(错误的)输出如下:

memory allocated
 matrix filled
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249

1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212

1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695

 Kernel done

 --- GPU ---
 NEW MATRIX:
1.28689 3.76588 3.88649
1.52547 4.42371 2.62566
1.48002 3.33719 1.58413
3.78243 2.8394 3.0249

1.14322 1.70261 2.02784
2.86852 2.87918 3.2896
4.87268 3.52447 1.58414
3.52306 3.84931 3.18212

1.76397 1.41317 4.9765
1.63338 4.79316 2.64009
1.99873 1.72617 1.15974
1.18922 4.21513 1.6695

 NEW VECTOR RESULTS:
0 0 0
0 0 0
0 0 0

 Cuda free

我期望新矩阵和向量会被以下操作修改：a[idx][i][j] = (a[idx][i][j] * a[idx][i][j]) * 3.14; 但是，看起来内核根本没有被执行，或者内核没有正常工作。

最佳答案

你有几个问题:

1. 当通过双重指针或三重指针访问托管内存时，指针树中的每一级指针都必须使用托管分配器进行分配。
2. 您的分配方案层次太多，并且您将一些指针分配了两次(内存泄漏)。
3. 您传递给内核的参数顺序与内核期望的参数顺序不匹配(n、m 颠倒了)。
4. 由于您启动的线程块/线程可能多于实际需要，您的内核需要进行线程检查(if 判断)。
5. 您的代码应该放在 .cu 文件中，而不是 .cpp 文件中。

以下代码解决了上述问题，并且在运行时似乎没有出现任何错误。

$ cat t61.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>
#include <assert.h>
#include <chrono>
#include <random>
#include <time.h>
#include <math.h>


using namespace std;


// Batched per-matrix kernel: one thread handles one whole matrix.
//   a           - a[matrix][row][col]; each element becomes 3.14 * element^2
//   m, n        - number of rows and columns visited in every matrix
//   numMatrices - batch size; threads with an index at or beyond it do nothing
//   w           - w[matrix][col]; receives 3.14 * a[matrix][1][col]^2, read
//                 from row 1 after that row has been updated
// Launch with a 1-D grid of 1-D blocks; no shared memory is used.
__global__ void SVD(double*** a, const int m, const int n, const int numMatrices, double** w)
{
  const int matrixId = blockIdx.x * blockDim.x + threadIdx.x;

  // The grid may contain more threads than matrices; the extras exit here.
  if (matrixId >= numMatrices) {
    return;
  }

  // Square-and-scale every element of this thread's matrix in place.
  for (int row = 0; row < m; ++row) {
    for (int col = 0; col < n; ++col) {
      double& elem = a[matrixId][row][col];
      elem = (elem * elem) * 3.14;
    }
  }

  // Derive the result vector from the (already updated) row with index 1.
  for (int col = 0; col < n; ++col) {
    const double& src = a[matrixId][1][col];
    w[matrixId][col] = 3.14 * src * src;
  }
}


// Host driver for the batched small-matrix example.
//
// Builds numMatrices matrices of lda x n doubles in managed (unified) memory,
// fills them with uniform random values in [1, 5), prints them, runs the SVD
// kernel with one thread per matrix, and prints the updated matrices and the
// per-matrix result vectors w.
//
// Every level of the A[matrix][row][col] / w[matrix][col] pointer trees is
// allocated with cudaMallocManaged so both host and device can dereference it,
// and every level is released again before returning. Any failing CUDA runtime
// call terminates the program with a diagnostic on stderr and a non-zero exit
// status.
int main()
{
  const int n = 3;
  const int m = 4;
  const int lda = m;
  const int numMatrices = 3;

  // Stop at the first failing CUDA call; otherwise an earlier error would only
  // surface later as unexplained wrong results.
  auto check = [](cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
      fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
      exit(EXIT_FAILURE);
    }
  };

  random_device device;
  mt19937 generator(device());
  uniform_real_distribution<double> distribution(1., 5.);

  // create pointers
  double*** A = nullptr;
  check(cudaMallocManaged(&A, sizeof(double**) * numMatrices), "cudaMallocManaged(A)");
  double** w = nullptr;
  check(cudaMallocManaged(&w, sizeof(double*) * numMatrices), "cudaMallocManaged(w)");

  //ALLOCATE SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    check(cudaMallocManaged(&(A[nm]), sizeof(double*) * lda), "cudaMallocManaged(A[nm])");
    check(cudaMallocManaged(&(w[nm]), sizeof(double) * n), "cudaMallocManaged(w[nm])");

    for (int i = 0; i < lda; i++) {
      check(cudaMallocManaged(&(A[nm][i]), sizeof(double) * n), "cudaMallocManaged(A[nm][i])");
    }
  }

  cout << " memory allocated" << endl;


  //FILL MATRICES INTO SHARED MEMORY
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      for (int j = 0; j < n; j++) {
        A[nm][i][j] = distribution(generator);
      }
    }
    // Zero the result vector once per matrix (not once per matrix row).
    for (int j = 0; j < n; j++) {
      w[nm][j] = 0.0;
    }
  }
  cout << " matrix filled " << endl;

  // Prints every matrix row by row, with a blank line after each matrix.
  auto printMatrices = [&]() {
    for (int nm = 0; nm < numMatrices; nm++) {
      for (int i = 0; i < lda; i++) {
        for (int j = 0; j < n; j++) {
          cout << A[nm][i][j] << " ";
        }
        cout << endl;
      }
      cout << endl;
    }
  };

  // PRINT MATRICES BEFORE CUDA OPERATION
  printMatrices();

  //KERNEL ----------------------------------------------------------------------
  // Tiny block size kept from the original demo; real workloads should use a
  // multiple of 32 threads per block.
  const int NThreads = 3;
  // Ceiling division: just enough blocks to cover every matrix, with no
  // surplus block when numMatrices is a multiple of NThreads.
  const int NBlocks = (numMatrices + NThreads - 1) / NThreads;

  SVD<<<NBlocks, NThreads>>>(A, m, n, numMatrices, w);
  check(cudaGetLastError(), "SVD launch");           // bad launch configuration
  check(cudaDeviceSynchronize(), "SVD execution");   // faults raised while running
  cout << " Kernel done " << endl << endl;

  cout << " --- GPU --- " << endl;
  cout << " NEW MATRIX: " << endl;
  printMatrices();

  cout << " NEW VECTOR RESULTS: " << endl;
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < n; i++) {
      cout << w[nm][i] << " ";
    }
    cout << endl;
  }

  cout << endl;

  //FREE THE DEVICE'S MEMORY -----------------------------------------------------
  // Release the pointer trees bottom-up: freeing only A and w would leak every
  // nested managed allocation.
  for (int nm = 0; nm < numMatrices; nm++) {
    for (int i = 0; i < lda; i++) {
      check(cudaFree(A[nm][i]), "cudaFree(A[nm][i])");
    }
    check(cudaFree(A[nm]), "cudaFree(A[nm])");
    check(cudaFree(w[nm]), "cudaFree(w[nm])");
  }
  check(cudaFree(A), "cudaFree(A)");
  check(cudaFree(w), "cudaFree(w)");
  cout << " Cuda free " << endl << endl;

  return 0;
}
$ nvcc -o t61 t61.cu
$ cuda-memcheck ./t61
========= CUDA-MEMCHECK
 memory allocated
 matrix filled
3.73406 3.51919 3.249
1.52374 2.678 2.50944
3.67358 1.15831 3.26327
2.58468 1.49937 2.67133

1.72144 2.99183 3.11156
1.06247 3.34983 4.23568
3.49749 3.07641 3.42827
4.09607 2.00557 2.12049

3.65427 3.98966 4.73428
1.68397 4.3746 2.95533
2.1914 4.96086 1.7165
3.10095 2.61781 4.52626

 Kernel done

 --- GPU ---
 NEW MATRIX:
43.7816 38.888 33.1458
7.29041 22.5191 19.7735
42.375 4.2129 33.4376
20.977 7.05908 22.407

9.30494 28.1062 30.4008
3.54453 35.2351 56.3348
38.41 29.7179 36.9045
52.6821 12.6301 14.1189

41.9306 49.9807 70.3782
8.90432 60.0905 27.4247
15.079 77.2757 9.25165
30.1939 21.5182 64.3294

 NEW VECTOR RESULTS:
166.891 1592.32 1227.71
39.4501 3898.35 9965.14
248.961 11338.1 2361.64

 Cuda free

========= ERROR SUMMARY: 0 errors
$

关于c++ - CUDA 中的并行批处理小矩阵不适用于 for 循环，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/64414577/

文章推荐： c++ - 分配顺序产生不同的装配

文章推荐： c++ - 将 std::decay 与 std::forward 一起使用

文章推荐： c++ - 执行 SHA256 算法未返回预期结果

文章推荐： c++ - 如何使用初始化列表构造函数以外的东西为类成员设置默认值

java - 用于 DMG 背景的自定义插入式图标，用于 MacOSX 上的应用程序打包
我在为 MacOSX 构建的独立包中添加 DMG 背景的自定义图标时遇到问题。我在项目的根目录中添加了一个包。正在从中加载自定义图标，但没有加载 DMG 背景图标。我正在使用 Java fx 2.2.
用于 Symbian VS 的 Qt。用于 MeeGo 的 Qt
Qt for Symbian 和 Qt for MeeGo 有什么区别？我知道 Qt 是一个交叉编译平台。这是否意味着如果我使用来自 Qt 的库，完全相同的库可以在所有支持 Qt 的设备(例如 Sym
用于 SQL 管理的 c# 命名空间，用于 sql server 备份
我正在尝试使用 C# .NET 3.5/4.0 务实地运行 SQL Server 数据库的备份。我已经找到了如何完成此操作，但是我似乎找不到用于备份的命名空间库。我正在寻找 Microsoft.Sq
java - 用于 Java 的开发服务器，就像 VS 用于 .NET 一样？
我最近在疯狂学习 Java，但我通常是一名 .NET 开发人员。 (所以请原谅我的新手问题。) 在 .Net 中，我可以在不使用 IIS 的情况下开发 ASP.Net 页面，因为它有一个简化的 Web
python - 正则表达式 (vim) 用于 print ... to print(...) 用于 python2 到 python3
这post仅当打印命令中有字符串时才有用。现在我有大量的源代码，其中包含一条声明，例如 print milk,butter 应该格式化为 print(milk,butter) 用\n 捕获行尾并不成功
ruby-on-rails - 未定义方法 `updated?' 用于 HasOneAssociation ，用于 Rails 4 中的嵌套属性
所以我的问题是: https://gist.github.com/panSarin/4a221a0923927115584a 当我保存这个表格时，我收到了标题中的错误 NoMethodError (u
javascript - 如何让 Html5 音频在点击时播放声音？ (ogg 用于 Firefox 等浏览器，mp3 用于 chrome 等浏览器)
如何让 Html5 音频在点击时播放声音？ (ogg 用于 Firefox 等浏览器，mp3 用于 chrome 等浏览器) 到目前为止，我可以通过 onclick 更改为单个文件类型，但我无法像在普
c++ - 将 .begin() 与 .end() 用于 std::inserter 用于 std::set 之间有区别吗？
如果it1和it2有什么区别？ std::set s; auto it1 = std::inserter(s, s.begin()); auto it2 = std::inserter(s, s.en
java - 我正在将 SpringMVC 用于 Web 应用程序，并将 sessionFactory 用于 Hibernate。以下是我正在使用的 pom.xml
4.0.0 com.amkit myapp SpringMVCFirst
javascript - 用于 ECMAScript-262 的 IDE，用于 node.js/V8 的 IDE 执行/调试
我目前使用 Eclipse 作为其他语言的 IDE，而且我习惯于不必离开 IDE 做任何事情 - 但是我真的很难为纯 ECMAScript-262 找到相同或类似的设置。澄清一下，我不是在寻找 DO
c# - 将带有字符串数组的 C# 结构传递给 c++ 函数，该函数接受 void * 用于 c# 结构和 char** 用于 c# 字符串数组
我想将带有字符串数组的C# 结构发送到C++ 函数，该函数接受void * 作为c# 结构和char** 作为c# 结构字符串数组成员。我能够将结构发送到 c++ 函数，但问题是，无法从 c++ 函
用于:param的JSF转换器
我正在使用动态创建的链接: 我想为f:param附加自定义转换器，以从＃{name}等中删除空格。但是f:param中没有转换器
.net - 用于.NET的写后缓存的Redis可扩展性
是否可以利用Redis为.NET创建后写或直写式缓存？理想情况下，透明的高速缓存是由单个进程写入的，并且支持从数据库加载丢失的数据，并每隔一段时间持久保存脏块？我已经搜查了好几个小时，也许是goog
bash - 多行ssh命令，用于
我正在通过bash执行命令的ssh脚本。 FILENAMES=( "export_production_20200604.tgz" "export_production_log_2020060
java - 用于 OR 两个范围正则表达式的正则表达式
我需要一个正则表达式来出现 0 到 7 个字母或 0 到 7 个数字。例如:匹配:1234、asdbs 不匹配:123456789、absbsafsfsf、asf12 我尝试了([a-zA-Z]{0
sql - 用于 BETWEEN 的日期列上的非聚集索引
我有一个用于会计期间的表格，该表格具有期间结束和开始的开始日期和结束日期。我使用此表来确定何时发生服务交易以及何时在查询中收集收入，例如... SELECT p.PeriodID, p.FiscalY
用于 Laravel 验证的仅接受单词或字母的正则表达式组合
我很难为只接受字符或数字的 Laravel 构建正则表达式验证。它是这样的: 你好<-好的 123 <- 好的你好123 <-不行我现在的正则表达式是这样的:[A-Za-z]|[0-9]。 reg
c# - 用于 OnItemDataBound
您实际上会在 Repeater 上使用 OnItemDataBound 做什么？最佳答案 “此事件为您提供在客户端显示数据项之前访问数据项的最后机会。引发此事件后，数据项将被清空，不再可用。” ~
用于 fragment 上自定义列表的android空指针异常设置适配器
我有一个 fragment 工作正常的项目，我正在使用 jeremyfeinstein 的 actionbarsherlock 和滑动菜单，一切正常，但是当我想自定义左侧抽屉列表单元格时，出现异常
iOS:用于 TabBar
最近几天，我似乎平均分配时间在构建我的第一个应用程序和在这里发布问题!! 这是我的第一个应用程序，也是我们的设计师完成的第一个应用程序。我试图满足他所做的事情的外观和感觉，但我认为他没有做适当的事情。

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c++ - CUDA 中的并行批处理小矩阵不适用于 for 循环