gpt4 book ai didi

c++ - CUDA : programming with twice as much blocks (tiling? )

转载 作者:太空狗 更新时间:2023-10-29 23:00:48 24 4
gpt4 key购买 nike

我的 3D Laplacian 解算器工作正常。我获得了 350 Gflop/s 的功率,我正在尝试升级它以获得更好的性能,同时使用两倍的 block 。然而,性能仍然是 350 Gflop/s:

 #include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;

// Constant statement.
const int blocksize=32;
const int N=128;
const int size=(N+2)*(N+2)*(N+2)*sizeof(float);

// Let's start the main program.
int main(void) {

// Variable statement.
float time1,time2,time3;
float *x_d, *y_d;
float *x,*y;
float gflops;
float NumOps;
int power=4; // You can change power as you prefer (but keep 2^x)

// Init x and y.
x = new float[size];
y = new float[size];

for (int k=1;k<N+1;k++)
for (int i=1;i<N+1;i++)
for (int j=1;j<N+1;j++) {
x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
}

// Shadow cases.
for (int k=1;k<N+1;k++) {
for (int i=1;i<N+1;i++) {
x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1];
x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}

for (int j=0;j<N+2;j++) {
x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j];
x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}

for (int i=0;i<N+2;i++)
for (int j=0;j<N+2;j++) {
x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
}

// Display of initial matrix.
int id_stage=-2;
while (id_stage!=-1) {
cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;

if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int i=0;i<N+2;i++) {
cout<<"| ";
for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}

// CPU to GPU.
cudaMalloc( (void**) & x_d, size);
cudaMalloc( (void**) & y_d, size);

cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;

// Solver parameters.
dim3 dimGrid(power*N/blocksize, power*N/blocksize);
dim3 dimBlock(blocksize, blocksize);

// Solver loop.
time1=clock();

res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);

time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;

// Power calculation.
NumOps=(1.0e-9)*N*N*N*7;
gflops = ( NumOps / (time3));

// GPU to CPU.
cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
cudaFree(x_d);
cudaFree(y_d);

// Display of final matrix.
id_stage=-2;
while (id_stage!=-1) {
cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;

if (id_stage != -1) {
cout<<"Etage "<<id_stage<<" du cube :"<<endl;
for (int i=0;i<N+2;i++) {
cout<<"| ";
for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
cout<<"|"<<endl;
}
cout<<endl;
}
}

cout<<"Time : "<<time3<<endl;
cout<<"Gflops/s : "<<gflops<<endl;

}

地点:

__ global__ void res2d(volatile float* x, float* y, int N, int power) 
{
int i = threadIdx.x + blockIdx.x*(blockDim.x);
int j = threadIdx.y + blockIdx.y*(blockDim.y);
int id,jd;

#pragma unroll //Now let's recude the number of operation per block
for (int incr=1; incr<power+1; incr++) {
if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
#pragma unroll
for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
id=i-(incr-1)*N;
jd=j-(incr-1)*N;
y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2]
+ x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd]
+ x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1]
+ x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1]
- 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
}
}
}
}

带参数:

dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)

问题:

  1. 如果power=2,48,每 block 的操作数被划分通过 248。但它并没有更快。为什么?

  2. 减少每个 block 的操作次数没有用吗?

预先感谢您的帮助。

最佳答案

CUDA 内核启动是异步的。当您这样做时:

 // Solver loop.
time1=clock();

res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);

time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;

计时器只捕获 API 启动延迟,而不是代码的实际执行时间。这就是为什么更改内核中完成的工作量显然对性能没有影响——您的计时方法不正确。

改为做这样的事情:

 // Solver loop.
time1=clock();

res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power);
cudaDeviceSynchronize();

time2=clock();
time3=(time2-time1)/CLOCKS_PER_SEC;

这会插入一个阻塞调用,以确保内核在测量时间之前完成执行。

[此答案添加为社区 wiki 条目,以便将问题从未回答的队列中删除]。

关于c++ - CUDA : programming with twice as much blocks (tiling? ),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/32741866/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com