gpt4 book ai didi

cuda - 使用 CUDA 进行求和归约:N 是什么?

转载 作者:行者123 更新时间:2023-12-04 11:58:28 24 4
gpt4 key购买 nike

据 NVIDIA 称,这是最快的求和归约内核:

// Last stage of the shared-memory sum: each step adds the element `offset`
// slots above sdata[tid] into sdata[tid], with offset = 32, 16, 8, 4, 2, 1.
// The calling kernel reads the final result from sdata[0].
//
// blockSize is a template parameter, so every `if (blockSize >= ...)` test is
// a compile-time constant and inapplicable steps can be compiled out.
//
// sdata - array of partial sums; volatile, presumably so that every step
//         re-reads and re-writes memory instead of a register copy.
// tid   - index of the calling thread's slot in sdata.
//
// Reads reach as far as sdata[tid + 32] when blockSize >= 64, so sdata must
// be large enough for that index.
//
// NOTE(review): there is no barrier between the steps; this looks like it
// relies on all callers being lanes of one warp executing in lockstep. On
// GPUs with independent thread scheduling (Volta and newer) that is not
// guaranteed and a __syncwarp() between steps may be required -- confirm
// against the target architecture.
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, unsigned int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
// Sums the ints in g_idata and writes one partial sum per thread block to
// g_odata[blockIdx.x].
//
// g_idata - input values.
// g_odata - output; element blockIdx.x receives this block's sum.
// n       - upper bound used for the index check in the accumulation loop.
//
// Shared memory is declared `extern`, so its size must be supplied at launch.
// NOTE(review): the indexing assumes blockDim.x == blockSize -- TODO confirm
// at the launch site.
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
// Each block starts at its own chunk of 2*blockSize elements ...
unsigned int i = blockIdx.x*(blockSize*2) + tid;
// ... and then advances by the number of elements the whole grid covers.
unsigned int gridSize = blockSize*2*gridDim.x;
sdata[tid] = 0;
// Accumulate two elements per iteration into this thread's shared slot.
// NOTE(review): only `i` is checked against n; g_idata[i+blockSize] is read
// without a bounds check, so it can be past index n-1 when i + blockSize >= n.
while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
// Make every thread's partial sum visible before combining them.
__syncthreads();
// Halve the number of active slots per step. __syncthreads() sits outside the
// `tid < ...` test so that all threads of the block reach the barrier.
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
// The remaining steps are done by threads 0..31 without block-wide barriers.
// NOTE(review): blockSize does not appear in warpReduce's parameter list, so
// it cannot be deduced from this call; presumably the original read
// warpReduce<blockSize>(sdata, tid) -- confirm.
if (tid < 32) warpReduce(sdata, tid);
// Thread 0 publishes the block's result.
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

但是,我不明白参数 “n” 的含义。有什么线索吗?我认为它不是待归约数组的大小,因为那样的话 while 循环中就会出现缓冲区溢出(越界读取)。

最佳答案

我认为您在幻灯片中发现了一处笔误(循环条件可能应该类似于 while(i + blockDim.x < n))。

如果您查看 CUDA SDK 示例 "reduction" 的源代码,最新版本的 reduce6 函数体如下所示:

template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void
reduce6(T *g_idata, T *g_odata, unsigned int n)
{
T *sdata = SharedMemory<T>();

// perform first level of reduction,
// reading from global memory, writing to shared memory
...

T mySum = 0;

// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
mySum += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
mySum += g_idata[i+blockSize];
i += gridSize;
}

注意 while 循环中的显式检查,它可以防止对 g_idata 的越界访问。您最初的怀疑是正确的;n 就是 g_idata 数组的大小。

关于 cuda - 使用 CUDA 进行求和归约:N 是什么?,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/8176488/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com