Cuda内核没有并发运行-6ren

Cuda内核没有并发运行

转载作者：太空宇宙更新时间：2023-11-04 04:38:58

最初我问的是，由于某种原因，当我指定不同的流时，我的内核拒绝并发运行。这个问题现在已经解决了，但是我仍然不清楚它们的并发行为。

~~我知道我的系统可以运行多个流，因为 concurrentKernels CUDA 示例运行良好。我还可以扩展此示例，使其模仿我的代码，并且它仍然可以并发运行。~~ 提前为大量代码道歉。我想把代码全部贴出来，因为可能只是某个小问题阻止了内核并发运行，也可能与我使用了结构体或拆分成许多单独的文件有关。此外，完整的代码应该也便于各位帮我排查！我刚刚编写了以下简化程序来复现我的问题:

testMain.c

#include <stdlib.h>
#include <signal.h>
#include "test.h"

/* Number of int elements requested for each device vector (passed to Get_Vector). */
#define Nsim 900000
/* Number of iterations of the launch loop in main(). */
#define Ncomp 20

/* The three vectors handed to computeGPU() on every iteration; created in main(). */
Vector* test1;
Vector* test2;
Vector* test3;

/* Streams created and destroyed in main(); declared extern in test.h. */
cudaStream_t stream1;
cudaStream_t stream2;
cudaStream_t stream3;

/*
 * Entry point: allocates three device vectors and three streams, calls
 * computeGPU() Ncomp times, then releases everything and resets the device.
 *
 * Every CUDA call is wrapped in checkGPU, which terminates the process on
 * failure, so this function only reaches exit(EXIT_SUCCESS) when all calls
 * succeeded.
 */
int
main (int argc, char **argv)
{
    test1 = Get_Vector(Nsim);
    test2 = Get_Vector(Nsim);
    test3 = Get_Vector(Nsim);

    checkGPU( cudaStreamCreate(&stream1) );
    checkGPU( cudaStreamCreate(&stream2) );
    checkGPU( cudaStreamCreate(&stream3) );

    int x = 0;
    for (x = 0; x < Ncomp; x++)
    {
      computeGPU(test1, test2, test3, x);
      /* Wait for this iteration's launches before issuing the next batch.
         cudaDeviceSynchronize() replaces the deprecated
         cudaThreadSynchronize(); the behaviour is the same. */
      checkGPU( cudaDeviceSynchronize() );
    }
    /* Final barrier before the streams are destroyed. */
    checkGPU( cudaDeviceSynchronize() );

    checkGPU( cudaStreamDestroy(stream1) );
    checkGPU( cudaStreamDestroy(stream2) );
    checkGPU( cudaStreamDestroy(stream3) );

    Free_Vector(test1);
    Free_Vector(test2);
    Free_Vector(test3);

    checkGPU( cudaDeviceReset() );
    exit(EXIT_SUCCESS);
}

basics.c

#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include "basics.h"

/*
 * Terminates the process with a diagnostic when a CUDA call did not succeed.
 *
 * code: status returned by a CUDA runtime call
 * file: source file name printed in the diagnostic
 * line: source line number printed in the diagnostic
 *
 * Returns normally only when code == cudaSuccess; otherwise writes the error
 * string, file and line to stderr and calls exit(EXIT_FAILURE).
 */
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
  if (code == cudaSuccess)
    return;

  fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
  exit(EXIT_FAILURE);
}

basics.h

#ifndef _BASICS_H
#define _BASICS_H

#include <cuda_runtime.h>

/*
 * Wraps a CUDA runtime call and hands its status to gpuAssert together with
 * the call site. Written as do { } while (0) so that `checkGPU(x);` expands
 * to exactly one statement and stays correct inside an unbraced if/else
 * (a bare `{ ... };` would terminate the if and orphan the else).
 */
#define checkGPU(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/* Prints the CUDA error string with file/line and exits when code != cudaSuccess. */
void gpuAssert(cudaError_t code, const char *file, int line);

#endif // _BASICS_H

test.cu

extern "C"
{
#include "test.h"
}

// Writes the same value, derived from x, into every element of in[0..n).
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element. The grid is
// rounded up to whole blocks, so threads whose global index is >= n return
// without touching memory.
__global__ void compute(int* in, int x, int n)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i >= n)
    return;  // tail-block thread past the end of the buffer

  // Evaluated in double precision; the unary '+' is a no-op, so this is
  // (x * 1.05) / 0.4 truncated to int.
  in[i] = (int) (x * + 1.05 / 0.4);
}

// Launches compute over one vector on the given stream and checks the launch.
static void
launchCompute(Vector* v, int x, cudaStream_t stream)
{
  const int threadsPerBlock = 256;

  // A zero-block grid is an invalid launch configuration, so skip empty vectors.
  if (!v || v->N <= 0)
    return;

  // Size the grid from this vector's own length (ceil-div).
  int blocksPerGrid = (v->N + threadsPerBlock - 1) / threadsPerBlock;
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(v->d_data, x, v->N);

  // <<<>>> returns nothing; configuration errors only show up here.
  // cudaGetLastError() does not synchronize, so stream overlap is unaffected.
  checkGPU( cudaGetLastError() );
}

// Issues one compute launch per vector: in1 on stream1, in2 on stream2,
// in3 on stream3. Returns without waiting for the kernels to finish.
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  launchCompute(in1, x, stream1);
  launchCompute(in2, x, stream2);
  launchCompute(in3, x, stream3);
}

test.h

#ifndef _TEST_H
#define _TEST_H

#include "vector.h"
#include "basics.h"
#include <cuda_runtime.h>

/* Streams defined in the main program file; computeGPU passes them as the
   stream argument of its kernel launches. */
extern cudaStream_t stream1;
extern cudaStream_t stream2;
extern cudaStream_t stream3;

/* Launches the compute kernel for in1, in2 and in3 with parameter x.
   Defined in test.cu with C linkage. */
extern void computeGPU(Vector* in1, Vector* in2, Vector* in3, int x);

#endif // _TEST_H

vector.c

#include <stdio.h>
#include <stdlib.h>
#include "vector.h"
#include "basics.h"

/*
 * Allocates a host-side Vector descriptor together with a device buffer of
 * N ints obtained from cudaMalloc.
 *
 * N: number of int elements; must not be negative.
 *
 * Returns a pointer the caller releases with Free_Vector. Never returns NULL:
 * an invalid N, host allocation failure, or cudaMalloc failure (via checkGPU)
 * terminates the process. The device buffer contents are not initialized here.
 */
Vector*
Get_Vector(int N)
{
  Vector* v;

  /* A negative count would wrap to a huge size_t in the byte computation. */
  if (N < 0)
    {
      fprintf(stderr, "Get_Vector: invalid element count %d\n", N);
      exit(EXIT_FAILURE);
    }

  v = (Vector*) calloc(1, sizeof(Vector));
  if (v == NULL)
    {
      fprintf(stderr, "Get_Vector: out of host memory\n");
      exit(EXIT_FAILURE);
    }

  v->N = N;
  checkGPU( cudaMalloc((void**) &v->d_data, (size_t) N * sizeof(int)) );
  return v;
}

/*
 * Releases a Vector created by Get_Vector: first the device buffer with
 * cudaFree, then the host descriptor with free.
 *
 * in: vector to release; NULL is accepted and ignored, mirroring free(NULL).
 *
 * A cudaFree failure terminates the process through checkGPU.
 */
void
Free_Vector(Vector* in)
{
  if (in == NULL)
    return;

  checkGPU( cudaFree(in->d_data) );
  free(in);
}

vector.h

#ifndef _VECTOR_H
#define _VECTOR_H

/* Host-side descriptor for an int buffer allocated with cudaMalloc. */
typedef struct
{
    int N;        /* element count given to Get_Vector */
    int* d_data;  /* pointer filled in by cudaMalloc (N * sizeof(int) bytes) */
} Vector;

/* Allocates a Vector and its N-int cudaMalloc buffer. */
extern Vector* Get_Vector(int N);

/* Releases the buffer with cudaFree and then the Vector itself with free. */
extern void Free_Vector(Vector* in);

#endif // _VECTOR_H

我编译:

nvcc -gencode arch=compute_20,code=sm_20 -O3 -use_fast_math -lineinfo -o test testMain.c test.cu basics.c vector.c; time ./test

并让单独的内核在 nvvp 中运行:

Kernels running serially instead of concurrently.

在 Roberts 的帮助下，我通过减少 Nsim 解决了这个问题。

如果像我的问题中那样 Nsim 很大 (900000)，GPU 会被线程块占满，因此即使指定了不同的流，内核也无法并发运行。性能分析结果如上。
如果 Nsim 很小 (900)，理论上内核可以并发运行，但是我的内核非常简单，执行完成所需的时间比启动下一个内核的开销还短，因此整个模拟在 RuntimeAPI 行中只显示为 Launch Compute(int*, int, int)。性能分析结果如下所示
如果我更改我的内核和代码，使内核运行时间更长(并将 Nsim 设置为合理的值，3000，现在不重要):

test.cu

// Writes a value derived from x into in[i], then busy-waits for roughly
// 5,000,000 * y clock ticks so that the kernel runs long enough to be seen
// in a profiler timeline.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per element.
// n is the element count of `in`; threads with a global index >= n skip the
// store but still execute the delay. n defaults to the largest int so that
// existing three-argument launches keep compiling with their previous
// (unguarded) behaviour; pass the real length to get the bounds check.
__global__ void compute(int* in, int x, int y, int n = 0x7fffffff)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
  {
    // Evaluated in double precision; the unary '+' is a no-op, so this is
    // (x * 1.05) / 0.4 truncated to int.
    in[i] = (int) (x * + 1.05 / 0.4);
  }

  // 64-bit budget and counter: 5000000 * y cannot overflow int, and the
  // elapsed-time difference cannot go negative when the 32-bit clock()
  // counter wraps. clock64() needs compute capability 2.0+, which matches
  // the sm_20 build line.
  const long long tick_budget = 5000000LL * y;
  const long long start_tick = clock64();
  while (clock64() - start_tick < tick_budget)
  {
    // spin
  }
}

// Launches compute over one vector on the given stream and checks the launch.
static void
launchCompute(Vector* v, int x, int y, cudaStream_t stream)
{
  const int threadsPerBlock = 256;

  // A zero-block grid is an invalid launch configuration, so skip empty vectors.
  if (!v || v->N <= 0)
    return;

  // Size the grid from this vector's own length (ceil-div).
  int blocksPerGrid = (v->N + threadsPerBlock - 1) / threadsPerBlock;
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(v->d_data, x, y, v->N);

  // <<<>>> returns nothing; configuration errors only show up here.
  // cudaGetLastError() does not synchronize, so stream overlap is unaffected.
  checkGPU( cudaGetLastError() );
}

// Issues one compute launch per vector with delay factors 1, 2 and 3:
// in1 on stream1, in2 on stream2, in3 on stream3. Returns without waiting
// for the kernels to finish.
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  launchCompute(in1, x, 1, stream1);
  launchCompute(in2, x, 2, stream2);
  launchCompute(in3, x, 3, stream3);
}

我的内核现在并发运行，等待三个内核完成，然后再启动下三个内核，因为我在我的循环中同步: kernels running concurrently

但是，如果按下面的改动来启动内核，我预期的是：由于我先在循环中启动所有内核，最后才同步一次，所有内核应该背靠背地运行——最快的内核在整个运行进行到 1/3 时结束，第二个在 2/3 时结束，最后一个在末尾结束。这里发生了什么？CUDA 是否施展了某种魔法，意识到无论如何都必须等待耗时最长的内核完成，于是把其他内核穿插进来运行，以某种方式做得更优？内核全部都已启动，运行时只等待一次同步(这可以在 RuntimeAPI 行中看到)。

testMain.c

int x = 0;
for (x = 0; x < Ncomp; x++)
{
  computeGPU(test1, test2, test3, x);
  //checkGPU( cudaThreadSynchronize() );
}
checkGPU( cudaThreadSynchronize() );

kernels running concurrent but not as expected

此外，按下面的方式启动内核得到的结果非常令人困惑，与预期不符。它们本应可以比这衔接得更好：其中两组内核的运行时间相同(1x3 和 3x1)，而另一个内核只需穿插在它们运行期间的某个位置即可。

test.cu

// Launches compute(v->d_data, x, y) over one vector on the given stream and
// checks the launch.
// NOTE(review): the three-argument launch gives the kernel no element count,
// so whether tail-block threads stay in bounds depends on the kernel itself;
// confirm against the compute definition in use.
static void
launchCompute(Vector* v, int x, int y, cudaStream_t stream)
{
  const int threadsPerBlock = 256;

  // A zero-block grid is an invalid launch configuration, so skip empty vectors.
  if (!v || v->N <= 0)
    return;

  // Size the grid from this vector's own length (ceil-div), not from in1's.
  int blocksPerGrid = (v->N + threadsPerBlock - 1) / threadsPerBlock;
  compute<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(v->d_data, x, y);

  // <<<>>> returns nothing; configuration errors only show up here.
  // cudaGetLastError() does not synchronize, so stream overlap is unaffected.
  checkGPU( cudaGetLastError() );
}

// Issues five launches in this order: three with delay factor 1 for in1 on
// stream1, one with factor 2 for in2 on stream2, and one with factor 3 for
// in3 on stream3. Returns without waiting for the kernels to finish.
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
  launchCompute(in1, x, 1, stream1);
  launchCompute(in1, x, 1, stream1);
  launchCompute(in1, x, 1, stream1);
  launchCompute(in2, x, 2, stream2);
  launchCompute(in3, x, 3, stream3);
}

confusing results

最佳答案

http://on-demand.gputechconf.com/gtc-express/2011/presentations/StreamsAndConcurrencyWebinar.pdf

请查看幻灯片 18，了解有关提交并发内核的有效顺序的说明。

有音频: https://developer.nvidia.com/gpu-computing-webinars

寻找 cuda 并发和流。

关于Cuda内核没有并发运行，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/28364356/

文章推荐： Linux 打印状态检查脚本

文章推荐： python - 如何在运行时替换或修改 Tornado 处理程序？

文章推荐： linux - 使用 Perl 如何清理没有文件的剩余目录？

文章推荐： css - 站点地图的 float div 请查看图片

angular - 从批处理文件运行一组命令(运行 VSCode、运行 NG 服务)
好的，所以我想从批处理文件运行我的整个工作环境... 我想要实现什么...... 打开新的 powershell，打开我的 API 文件夹并从该文件夹运行 VS Code 编辑器(cd c:\xy;
单击“运行”按钮时，iOS Xcode 项目不会构建/运行
我正在查看 Cocoa Controls 上的示例并下载了一些演示。我遇到的问题是一些例子，比如 BCTabBarController ，不会在我的设备上构建或启动。当我打开项目时，它看起来很正常，没
c - 运行 C — helloWorld 运行，但没有其他内容 — Ubuntu
我刚刚开始学习 C 语言(擅长 Java 和 Python)。当编写 C 程序(例如 hello world)时，我在 ubuntu cmd 行上使用 gcc hello.c -o hello 编译
php - 从 cron 运行 php 没有作为 CLI 运行
我在 php 脚本从 cron 开始运行到超时后注意到了这个问题，但是当它从命令行手动运行时这不是问题。 (对于 CLI，PHP 默认的 max_execution_time 是 0) 所以我尝试运行
node.js - 如何通过 IntelliJ 运行/调试配置让 wdio 运行？
我可以使用命令行运行测试 > ./node_modules/.bin/wdio wdio.conf.js 但是如果我尝试从 IntelliJ 的运行/调试配置运行它，我会遇到各种不同的错误。 Fea
java - 从 python 运行 bat 文件会返回错误，而直接从 cmd 运行
Error occurred during initialization of VM. Could not reserve enough space for object heap. Error: C
python - 无法从 anaconda 运行 jupyter 笔记本，但可以从 python 运行
将 Anaconda 安装到 C:\ 后，我无法打开 jupyter 笔记本。无论是在带有 jupyter notebook 的 Anaconda Prompt 中还是在导航器中。我就是无法让它工作。
Python 脚本通过双击和 IDLE 运行，但不通过 Windows CMD shell 运行
我遇到一个问题，如果我双击我的脚本 (.py)，或者使用 IDLE 打开它，它将正确编译并运行。但是，如果我尝试在 Windows 命令行中运行脚本，请使用 C:\> "C:\Software_Dev
php - 查询从 postman 和 phpmyadmin 运行，但不是从 android 运行
情况我正在使用 mysql 数据库。查询从 phpmyadmin 和 postman 运行但是当我从 android 发送请求时(它返回零行) 我已经记录了从 android 发送的电子邮件是正确
java - 从 Java 运行 .exe 会提供与直接从 Windows 运行 .exe 不同的控制台输出
所以这个有点奇怪 - 为什么从 Java 运行 .exe 文件会给出不同的输出而不是直接运行 .exe。当 java 在下面的行执行时，它会调用我构建的可与 3CX 电话系统配合使用的 .exe 文
c# - 应用程序在 Visual Studio 的单元测试中以 x86 运行，但在独立时以 x64 运行
这行代码 Environment.Is64BitProcess 当我的应用单独运行时评估为真。但是当它在我的 Visual Studio 单元测试中运行时，相同的表达式的计算结果为 false。我
javascript - 使用 JQuery 运行 AJAX 和使用普通 XMLHttpRequest 运行 AJAX 有什么区别？
关闭。这个问题是opinion-based .它目前不接受答案。想要改进这个问题？更新问题，以便 editing this post 可以用事实和引用来回答它. 关闭 8 年前。 Improve
c - 为什么我的 C 程序可以在 "git bash"运行，但不能在 "cmd"运行？
我写了一个使用 libpq 连接到 PostgreSQL 数据库的演示。我尝试通过包含将 C 文件连接到 PostgreSQL #include 在我将路径添加到系统变量 I:\Program F
java - 从 Jenkins 运行 Android 模拟器以使用 Robotium 运行 Junit 测试
如何从 Jenkins 运行 Android 模拟器来运行我的测试？当我在 Execiute Windows bath 命令中写入时，运行模拟器的命令: emulator -avd Tester 然后
ruby-on-rails - 使用 ngninx 运行 errbit，使用 ssl 运行 passenger
我已经配置好东西，这样我就可以使用 ssl 登录和访问在 nginx 上运行的 errbit 我的问题是我不知道如何设置我的 Rails 应用程序的 errbit.rb 以便我可以运行测试 nginx
ios - flutter app 不是由 flutter build ios 运行，而是由 xcode 运行
我编写了 flutter 应用程序，我通过 xcode 打开了 ios 部分并且应用程序正在运行，但是当我通过 flutter build ios 通过 vscode 运行应用程序时，我得到了这个错误
python - 我的 python 脚本通过我的 IDE (PyCharm) 运行，但无法使用 Python shell 运行
我有一个简短的 python 脚本，它使用日志记录模块和 configparser 模块。我在Win7下使用PyCharm 2.7.1和Python 3.3。当我使用 PyCharm 运行我的脚本时
c# - .NET 2005 - 通过 IIS 的测试作为 x86 运行。单元测试以 x64 运行
我在这里遇到了一些难题。我的开发箱是 64 位的，windows 7。我所有的项目都编译为“任何 CPU”。该项目引用了 64 位版本的第 3 方软件当我运行不使用任何 Web 引用的单元测试时，
c++ 相同的代码从不在 Visual Studio 中编译/运行，有时在 Qt Creator 中编译/运行
当我注意到以下问题时，我正在做一些 C++ 练习。给定的代码将不会在 Visual Studio 2013 或 Qt Creator 5.4.1 中运行/编译报错: invalid types 'd
airflow - 运行 dag 并让 Airflow 运行 : error: the following arguments are required: task_id,execution_date
假设我有一个 easteregg.py 文件: from airflow import DAG from dateutil import parser from datetime import tim

太空宇宙

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

Cuda内核没有并发运行