c++ - CUDA : programming with twice as much blocks (tiling? )-6ren

c++ - CUDA : programming with twice as much blocks (tiling? )

转载作者：太空狗更新时间：2023-10-29 23:00:48

24

4

我的 3D Laplacian 解算器工作正常。我获得了 350 Gflop/s 的功率，我正在尝试升级它以获得更好的性能，同时使用两倍的 block 。然而，性能仍然是 350 Gflop/s:

 #include <iostream>
 #include <sys/time.h>
 #include <cuda.h>
 #include <ctime>
 #include"res3dcb.cuh"
 #include <math.h>
 using namespace std;

 // Constant statement.
 const int blocksize=32;
 const int N=128;
 const int size=(N+2)*(N+2)*(N+2)*sizeof(float);

 // Let's start the main program.
 int main(void) {

 // Variable statement.
 float time1,time2,time3;
 float *x_d, *y_d; 
 float *x,*y; 
 float gflops;
 float NumOps;
 int power=4; // You can change power as you prefer (but keep 2^x)

 // Init x and y. 
 x = new float[size];
 y = new float[size];

 for (int k=1;k<N+1;k++)
    for (int i=1;i<N+1;i++) 
        for (int j=1;j<N+1;j++) { 
            x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
        }

 // Shadow cases.
 for (int k=1;k<N+1;k++) {
    for (int i=1;i<N+1;i++) { 
      x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1]; 
      x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}

    for (int j=0;j<N+2;j++) { 
      x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j]; 
      x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}

 for (int i=0;i<N+2;i++) 
    for (int j=0;j<N+2;j++) {
        x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
        x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
    }

 // Display of initial matrix.
 int id_stage=-2;
 while (id_stage!=-1) {
     cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
     cin>>id_stage;
     cout<<endl;

     if (id_stage != -1) {
    cout<<"Etage "<<id_stage<<" du cube :"<<endl;
    for (int i=0;i<N+2;i++) {
        cout<<"| ";
        for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
        cout<<"|"<<endl;
        }
         cout<<endl;
     }
 }

 // CPU to GPU.
 cudaMalloc( (void**) & x_d, size);
 cudaMalloc( (void**) & y_d, size);

 cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
 cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;

 // Solver parameters.
 dim3 dimGrid(power*N/blocksize, power*N/blocksize);
 dim3 dimBlock(blocksize, blocksize);

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

 // Power calculation.
 NumOps=(1.0e-9)*N*N*N*7;
 gflops = ( NumOps / (time3));

 // GPU to CPU.
 cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
 cudaFree(x_d);
 cudaFree(y_d);

 // Display of final matrix.
 id_stage=-2;
 while (id_stage!=-1) {
    cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
    cin>>id_stage;
    cout<<endl;

     if (id_stage != -1) {
        cout<<"Etage "<<id_stage<<" du cube :"<<endl;
        for (int i=0;i<N+2;i++) {
            cout<<"| ";
            for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
            cout<<"|"<<endl;
         }
        cout<<endl;
     }
 }

 cout<<"Time : "<<time3<<endl;
 cout<<"Gflops/s : "<<gflops<<endl;

 }

地点:

__ global__ void res2d(volatile float* x, float* y, int N, int power) 
{
    int i = threadIdx.x + blockIdx.x*(blockDim.x);
    int j = threadIdx.y + blockIdx.y*(blockDim.y);
    int id,jd;

    #pragma unroll //Now let's recude the number of operation per block
    for (int incr=1; incr<power+1; incr++) {
        if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
            #pragma unroll
            for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
                id=i-(incr-1)*N;
                jd=j-(incr-1)*N;
                y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd] 
                                                       + x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1] 
                                                       + x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1] 
                                                       - 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
            }   
        }
    }
}

带参数:

dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)

问题:

如果power=2,4或8，每 block 的操作数被划分通过 2、4 或 8。但它并没有更快。为什么？
减少每个 block 的操作次数没有用吗？

预先感谢您的帮助。

最佳答案

CUDA 内核启动是异步的。当您这样做时:

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

计时器只捕获 API 启动延迟，而不是代码的实际执行时间。这就是为什么更改内核中完成的工作量显然对性能没有影响——您的计时方法不正确。

改为做这样的事情:

 // Solver loop.
 time1=clock();

 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 
 cudaDeviceSynchronize();

 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

这会插入一个阻塞调用，以确保内核在测量时间之前完成执行。

[此答案添加为社区 wiki 条目，以便将问题从未回答的队列中删除]。

关于c++ - CUDA : programming with twice as much blocks (tiling? )，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/32741866/

24

4

0

文章推荐： java - 尝试创建 jar 时 list 文件中的行太长

文章推荐： c# - 如何从 C# 中的函数指针获取函数名称？

文章推荐： java ->> 在 Java 中做什么？

文章推荐： c# - 没有排序规范的 List<>.Sort() 需要编译时错误

google-maps - 如何在不指定缩放级别(或 LevelOfDetails)的情况下获取 Tile Count、Tile X、Tile Y 详细信息？
这是引用 Google Tile Map 或 Bing Maps。是否可以在不使用任何类型的内部计算指定缩放级别(或 LevelOfDetails)的情况下获得 Tile Count、Tile X、T
Spring & Tiles - 摆脱 tiles.xml
我有一个 Tiles 模板，其中每个页面都提供一个标题，一些要添加到中的内容，要放入一些混凝土的东西，以及附加到的内容在一切之后。大部分内容都非常小，因为页面是用 JS 呈现的。我怎样才
tiles - Apache Tiles 2.1 - 如何防止继承的列表属性重复？
我正在使用 Apache Tiles 2.1 开展一个项目。我遇到了一个问题，即使用列表属性扩展模板会创建这些列表项的重复项...每个继承级别都有一组重复项。作为示例，以下是基本定义及其将生成的页
html - 重复背景 : big tile or small tile?
我在 HTML 的背景层中有一个重复的水平图案。我可以使用 1 像素宽的图像或它的倍数(10、20、50...)来实现。问题是: 哪个更好？使用薄(小文件大小)图像并使其重复很多使用更大的图片，
tiles - Apache Tiles 3.0 - 本地保存和引用 DTD
我们正在使用 Apache Tiles 3.0。在我们的 Apache Tiles-Def 文件中，我们偶尔会遇到此 DTD 引用的问题，这可能是因为该站点不可靠。偶尔会出现“Reading Def
java - 我是否必须在 Apache Tiles 中的每个 tile 中包含库？
我在我的网络应用程序中使用 Tiles。我在瓷砖中使用了标准布局 (standard.jsp)。在 standard.jsp 之上有很多包含，涉及标签库等。让我们做一个简化的例子。标准.jsp:
c++ - 如何为基于 2D tile 的游戏定义 tile 结构？
我正在制作一个基于 2d tile 的游戏，我试图通过对 tile 结构中的值使用位字段和字节来保持我的 tile 结构小: struct Tile { // 3 bytes (24 bits
javascript - 我在 Tiled 中制作的 Tile map 未在浏览器中加载
我已经在 Tiled 中制作了一张 map ，并且已经生成了一个 JSON。每当我尝试在 Chrome 中加载 map 时，它根本无法加载。当我去检查网站时，有类似警告: 在来自 Tiled 的 Js
jsp - Struts2 Tiles 将 var 设置为 tiles 属性作为字符串
我使用 struts 2.3.16 和 tiles 2.0.6。具有空属性(以及其他属性)的图 block 定义: 一些其他定义扩展了它，他们可以选择填入值: 或者也留空。我试图通过使用 st
java - 瓷砖 :insert and tiles:get in tiles framework? 之间有什么区别
我想在Struts2框架中使用tile。作为用于显示图 block 内容的标签，tiles:insert 和tiles:get 之间的基本区别是什么？最佳答案 is equivalent to
java - 将 Struts-Tiles 迁移到 Spring+tiles 3
我正在从 Strut1 + Tiles 项目迁移到 SpringMVC 和 Apache Tiles 3。我对Struts1+Tiles只了解一点点，它太旧了，我陷入了Struts-tiles中的Co
spring - 找不到 "http://tiles.apache.org/tags-tiles"的标签库描述符
我正在使用Tiles，Spring和Hibernate创建一个应用程序。在运行时，它显示以下错误: Can not find the tag library descriptor for "http
windows-phone-8 - 使用 Flip tile 实现循环 tile 行为。
目前我在计划任务的帮助下实现了我的翻转图块，因此类似方法的一个问题是翻转图块将在时间间隔内翻转相同的图像。所以我想要实现的是我需要像循环瓷砖一样翻转我的图像。即一个接一个的方式。这里要注意的一件事是我
ios - 适用于 iPhone 的 Tiled 和 Cocos2D : How do you animate tiles?
我正在使用 v0.6.2 和 Cocos2D v0.99.5，现在我真的非常需要为我的一些图 block 制作动画。我读了part of a wiki关于如何为图 block 制作动画，但它似乎与 M
java - 为 Spring MVC 中包含 Tile 的任何 Tiles 布局提供对象
我有一个图 block ，其中列出了提供给它的对象中的文章。此图 block 是大多数页面的一部分，但不是全部页面。有没有什么方法可以自动仅向需要它的页面提供对象(包含特定图 block )？现在我只
c# - Unity 2D Tile Map -- Group Tiles By Type C#
我正在寻找一种解决方案，以根据游戏 block 的类型对我的游戏 block 进行分组。瓷砖存储在二维数组中，类型为空和水，分组的瓷砖将存储在组类中。所以如果我有二维数组: 0, 0, 0, 0,
java - 无法读取工件 'org.apache.tiles:tiles-core:jar' 的元数据文件
我在今天的 maven 构建过程中遇到以下错误。 Unable to resolve artifact: Unable to get dependency information: Unable to
tiles - 让 Tiles 的 put-list-attribute 与 Thymeleaf 模板一起使用
我正在尝试让 Apache Tiles' put-list-attribute 与 Thymeleaf 一起工作。这是我尝试过的: 来自 Tiles 配置: 来自 thymlea
html - 有没有办法突出显示点击的 mat-grid-tile？当动态生成每个 mat-grid-tile 时
我在 UI 的 mat-grid-list 中动态显示 mat-grid-tile，其数量和数据随着后端值的变化而变化。它们是动态生成的。单击任何 mat-grid-tile 都会调用一个函数并将数据
ios - Sprite Kit 使用 16x16 tiles 与 32x32 tiles 渲染性能？
我制作了一个 Tiled 游戏。现在，我正在通过增加场景中的节点数量来对手机的功能进行压力测试。有基于物理的东西，AI 运动，日夜系统，粒子在这里和那里突然出现以及我的场景的引擎盖下发生的许多其他事情

首页

博学

6Ren·AI

商城

c++ - CUDA : programming with twice as much blocks (tiling? )

问题: