cudaMallocManaged and cudaDeviceSynchronize()


I have the following two nearly identical sample codes. code1.cu uses cudaMalloc and cudaMemcpy to handle the exchange of values between device and host.

code2.cu uses cudaMallocManaged, so no cudaMemcpy is needed. When using cudaMallocManaged, however, I have to include cudaDeviceSynchronize() to get the correct result, while for the cudaMalloc version this is not necessary. I would appreciate some hints on why this happens.

code2.cu

#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if (i < max_x && j < max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}

int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    // allocate d_tot in Unified Memory - accessible from CPU or GPU
    cudaMallocManaged((void **)&d_tot, sizeof(float));

    // initialize x and y arrays on the host
    vector<float> vx;
    vector<float> vy;
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);
    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    // reference result computed on the CPU
    float tot = 0;
    for (int i = 0; i < vx.size(); i++)
        for (int j = 0; j < vy.size(); j++)
            tot += vx[i] + vy[j];

    cout << "CPU: tot: " << tot << endl;

    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize;   // the launch configurator's suggested block size
    int minGridSize; // the minimum grid size needed to achieve maximum occupancy
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by cannot exceed blockSize, or the hardware limit, which is 1024;
    //.. solve bx*by = blockSize && bx/by = Nx/Ny
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    // note: the +1 launches one extra row of blocks, guarded by the bounds check in the kernel
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout << "blockSize: " << blockSize << endl;
    cout << "bx: " << bx << " by: " << by << " gx: " << gridSize_3D.x << " gy: " << gridSize_3D.y << endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                      (float)(props.maxThreadsPerMultiProcessor / props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
           blockSize, occupancy);

    // run the kernel on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    // read the managed result directly on the host
    tot = *d_tot;

    cout << " GPU: tot: " << tot << endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}

code1.cu

#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to do nested loops
__global__
void add(int max_x, int max_y, float *tot, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y*blockDim.y + threadIdx.y;
    if (i < max_x && j < max_y) {
        atomicAdd(tot, x[i] + y[j]);
    }
}

int main(void)
{
    int Nx = 1<<15;
    int Ny = 1<<15;
    float *d_x = NULL, *d_y = NULL;
    float *d_tot = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*Nx);
    cudaMalloc((void **)&d_y, sizeof(float)*Ny);
    // here d_tot is ordinary device memory, not Unified Memory
    cudaMalloc((void **)&d_tot, sizeof(float));

    // initialize x and y arrays on the host
    vector<float> vx;
    vector<float> vy;
    for (int i = 0; i < Nx; i++)
        vx.push_back(i);
    for (int i = 0; i < Ny; i++)
        vy.push_back(i*10);

    // reference result computed on the CPU
    float tot = 0;
    for (int i = 0; i < vx.size(); i++)
        for (int j = 0; j < vy.size(); j++)
            tot += vx[i] + vy[j];

    cout << "CPU: tot: " << tot << endl;

    cudaMemcpy(d_x, vx.data(), vx.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), vy.size()*sizeof(float), cudaMemcpyHostToDevice);

    int blockSize;   // the launch configurator's suggested block size
    int minGridSize; // the minimum grid size needed to achieve maximum occupancy
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add, 0, Nx+Ny);

    //.. bx*by cannot exceed blockSize, or the hardware limit, which is 1024;
    //.. solve bx*by = blockSize && bx/by = Nx/Ny
    int bx = sqrt(blockSize*Nx/(float)Ny);
    int by = bx*Ny/(float)Nx;
    dim3 blockSize_3D(bx, by);
    // note: the +1 launches one extra row of blocks, guarded by the bounds check in the kernel
    dim3 gridSize_3D((Nx+bx-1)/bx, (Ny+by+1)/by);

    cout << "blockSize: " << blockSize << endl;
    cout << "bx: " << bx << " by: " << by << " gx: " << gridSize_3D.x << " gy: " << gridSize_3D.y << endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
                      (float)(props.maxThreadsPerMultiProcessor / props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
           blockSize, occupancy);

    // run the kernel on the GPU
    tot = 0;
    add<<<gridSize_3D, blockSize_3D>>>(Nx, Ny, d_tot, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    //cudaDeviceSynchronize();

    // copy the result back to the host
    cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost);

    cout << " GPU: tot: " << tot << endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_tot);

    return 0;
}


// code2.cu produces the following output:
//
// CPU: tot: 8.79609e+12
// blockSize: 1024
// bx: 32 by: 32 gx: 1024 gy: 1025
// Launched blocks of size 1024. Theoretical occupancy: 1.000000
// GPU: tot: 0

After uncommenting the cudaDeviceSynchronize() line, the output becomes:

GPU: tot: 8.79609e+12

Best Answer

CUDA kernel launches are asynchronous. That means their execution occurs independently of the CPU thread that launched them.

Because of this asynchronous launch, by the time your CPU thread code begins testing the result, there is no guarantee that the CUDA kernel has completed (or even started).
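
To make this asynchrony visible, one option (not part of the original question; the spin kernel and its launch parameters below are purely illustrative) is to time the launch call itself: it returns almost immediately, while cudaDeviceSynchronize() blocks until the kernel has actually finished. A minimal sketch:

#include <cstdio>
#include <chrono>

// kernel that busy-waits for roughly the given number of GPU clock cycles
__global__ void spin(long long cycles)
{
    long long start = clock64();
    while (clock64() - start < cycles) { }
}

int main(void)
{
    using clk = std::chrono::steady_clock;

    auto t0 = clk::now();
    spin<<<1, 1>>>(1000000000LL); // the launch call returns right away
    auto t1 = clk::now();
    cudaDeviceSynchronize();      // blocks until the kernel has finished
    auto t2 = clk::now();

    auto us = [](clk::time_point a, clk::time_point b) {
        return (long long)std::chrono::duration_cast<std::chrono::microseconds>(b - a).count();
    };
    printf("launch call took %lld us\n", us(t0, t1));
    printf("synchronize took %lld us\n", us(t1, t2));
    return 0;
}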

Therefore it is necessary to wait for the GPU kernel to complete, and cudaDeviceSynchronize() does exactly that. cudaMemcpy also has a synchronizing effect, so when you remove the cudaMemcpy operation you lose that synchronization; cudaDeviceSynchronize() restores it.
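
Applied to code2.cu, the fix is to synchronize, and ideally check the returned error code, before dereferencing the managed pointer on the host. Without the cudaDeviceSynchronize() call, the host read races the still-running kernel and typically sees the initial value 0, exactly as in the output above. A minimal sketch of the pattern (the add_one kernel and its launch configuration are illustrative, not taken from the question):

#include <cstdio>

// each thread adds 1.0f to the managed accumulator
__global__ void add_one(float *tot)
{
    atomicAdd(tot, 1.0f);
}

int main(void)
{
    float *d_tot = NULL;
    cudaMallocManaged((void **)&d_tot, sizeof(float));
    *d_tot = 0.0f; // managed memory is directly writable from the host

    add_one<<<256, 256>>>(d_tot);

    // wait for the kernel to finish before reading the result on the host
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("CUDA error: %s\n", cudaGetErrorString(err));

    printf("tot: %f (expected %d)\n", *d_tot, 256*256);
    cudaFree(d_tot);
    return 0;
}

In code1.cu the explicit synchronization happens to be unnecessary only because the blocking cudaMemcpy(&tot, d_tot, sizeof(float), cudaMemcpyDeviceToHost) that reads back the result provides the same guarantee implicitly.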

Regarding cudaMallocManaged and cudaDeviceSynchronize(), we found a similar question on Stack Overflow: https://stackoverflow.com/questions/58695262/
