gpt4 book ai didi

cuda - 在 CUDA 扭曲级别减少中删除 __syncthreads()

转载 作者:行者123 更新时间:2023-12-02 04:39:43 27 4
gpt4 key购买 nike

以下代码将数组中的每个 32 元素相加到每个 32 元素组的第一个元素:

int i = threadIdx.x;
int warpid = i&31;
if(warpid < 16){
s_buf[i] += s_buf[i+16];__syncthreads();
s_buf[i] += s_buf[i+8];__syncthreads();
s_buf[i] += s_buf[i+4];__syncthreads();
s_buf[i] += s_buf[i+2];__syncthreads();
s_buf[i] += s_buf[i+1];__syncthreads();
}

我认为我可以消除代码中的所有__syncthreads(),因为所有操作都是在同一个扭曲中完成的。但如果我消除它们,我会得到垃圾结果。它不会对性能产生太大影响,但我想知道为什么这里需要 __syncthreads()

最佳答案

我在这里提供答案是因为我认为以上两个并不完全令人满意。这个答案的“知识产权”属于Mark Harris,他在这个presentation中指出了这个问题。 (幻灯片 22),以及 @talonmies,他在上面的评论中向 OP 指出了这个问题。

让我首先尝试恢复OP所询问的内容,过滤他的错误。

OP 似乎正在处理减少共享内存的最后一步,即通过循环展开来减少扭曲。他正在做类似的事情

template <class T>
__device__ void warpReduce(T *sdata, int tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}

template <class T>
__global__ void reduce4_no_synchthreads(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];

unsigned int tid = threadIdx.x; // Local thread index
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; // Global thread index - Fictitiously double the block dimension

// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;

// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();

// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}

// --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
if (tid < 32) warpReduce(sdata, tid);

// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

正如 Mark Harris 和 talonmies 所指出的,共享内存变量 sdata 必须声明为 volatile ,以防止编译器优化。因此,定义上面的__device__函数的正确方法是:

template <class T>
__device__ void warpReduce(volatile T *sdata, int tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}

现在让我们看看与上述两种情况相对应的反汇编代码,即 sdata 声明为非 volatile volatile (代码为费米架构编译)。

volatile

    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ SHL R3, R0, 0x1; /* 0x6000c0000400dc03 */
/*0018*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0020*/ IMAD R3, R3, c[0x0][0x8], R2; /* 0x200440002030dca3 */
/*0028*/ IADD R4, R3, c[0x0][0x8]; /* 0x4800400020311c03 */
/*0030*/ ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */
/*0038*/ ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */
/*0040*/ @P0 ISCADD R3, R3, c[0x0][0x20], 0x2; /* 0x400040008030c043 */
/*0048*/ @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2; /* 0x4000400080412443 */
/*0050*/ @!P0 MOV R5, RZ; /* 0x28000000fc0161e4 */
/*0058*/ @!P1 LD R4, [R4]; /* 0x8000000000412485 */
/*0060*/ @P0 LD R5, [R3]; /* 0x8000000000314085 */
/*0068*/ SHL R3, R2, 0x2; /* 0x6000c0000820dc03 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ @!P1 IADD R5, R4, R5; /* 0x4800000014416403 */
/*0080*/ MOV R4, c[0x0][0x8]; /* 0x2800400020011de4 */
/*0088*/ STS [R3], R5; /* 0xc900000000315c85 */
/*0090*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0098*/ MOV R6, c[0x0][0x8]; /* 0x2800400020019de4 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R6, 0x42, PT; /* 0x188ec0010861dc03 */
/*00a8*/ @P0 BRA 0x118; /* 0x40000001a00001e7 */
/*00b0*/ NOP; /* 0x4000000000001de4 */
/*00b8*/ NOP; /* 0x4000000000001de4 */
/*00c0*/ MOV R6, R4; /* 0x2800000010019de4 */
/*00c8*/ SHR.U32 R4, R4, 0x1; /* 0x5800c00004411c03 */
/*00d0*/ ISETP.GE.U32.AND P0, PT, R2, R4, PT; /* 0x1b0e00001021dc03 */
/*00d8*/ @!P0 IADD R7, R4, R2; /* 0x480000000841e003 */
/*00e0*/ @!P0 SHL R7, R7, 0x2; /* 0x6000c0000871e003 */
/*00e8*/ @!P0 LDS R7, [R7]; /* 0xc10000000071e085 */
/*00f0*/ @!P0 IADD R5, R7, R5; /* 0x4800000014716003 */
/*00f8*/ @!P0 STS [R3], R5; /* 0xc900000000316085 */
/*0100*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0108*/ ISETP.GT.U32.AND P0, PT, R6, 0x83, PT; /* 0x1a0ec0020c61dc03 */
/*0110*/ @P0 BRA 0xc0; /* 0x4003fffea00001e7 */
/*0118*/ ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT; /* 0x1a0ec0007c21dc03 */
/*0120*/ @P0 BRA.U 0x198; /* 0x40000001c00081e7 */
/*0128*/ @!P0 LDS R8, [R3]; /* 0xc100000000322085 */
/*0130*/ @!P0 LDS R5, [R3+0x80]; /* 0xc100000200316085 */
/*0138*/ @!P0 LDS R4, [R3+0x40]; /* 0xc100000100312085 */
/*0140*/ @!P0 LDS R7, [R3+0x20]; /* 0xc10000008031e085 */
/*0148*/ @!P0 LDS R6, [R3+0x10]; /* 0xc10000004031a085 */
/*0150*/ @!P0 IADD R8, R8, R5; /* 0x4800000014822003 */
/*0158*/ @!P0 IADD R8, R8, R4; /* 0x4800000010822003 */
/*0160*/ @!P0 LDS R5, [R3+0x8]; /* 0xc100000020316085 */
/*0168*/ @!P0 IADD R7, R8, R7; /* 0x480000001c81e003 */
/*0170*/ @!P0 LDS R4, [R3+0x4]; /* 0xc100000010312085 */
/*0178*/ @!P0 IADD R6, R7, R6; /* 0x480000001871a003 */
/*0180*/ @!P0 IADD R5, R6, R5; /* 0x4800000014616003 */
/*0188*/ @!P0 IADD R4, R5, R4; /* 0x4800000010512003 */
/*0190*/ @!P0 STS [R3], R4; /* 0xc900000000312085 */
/*0198*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0x1a8e0000fc21dc23 */
/*01a0*/ @P0 BRA.U 0x1c0; /* 0x40000000600081e7 */
/*01a8*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */
/*01b0*/ @!P0 LDS R2, [RZ]; /* 0xc100000003f0a085 */
/*01b8*/ @!P0 ST [R0], R2; /* 0x900000000000a085 */
/*01c0*/ EXIT; /* 0x8000000000001de7 */

/*0128*/-/*0148*//*0160*//*0170*/ 对应于共享内存加载到寄存器,并将 /*0190*/ 行从寄存器加载到共享内存存储。中间行对应于寄存器中执行的求和。因此,中间结果保存在寄存器(每个线程私有(private))中,并且不会每次刷新到共享内存,从而防止线程完全了解中间结果。

易变

    /*0000*/         MOV R1, c[0x1][0x100];                          /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ SHL R3, R0, 0x1; /* 0x6000c0000400dc03 */
/*0018*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0020*/ IMAD R3, R3, c[0x0][0x8], R2; /* 0x200440002030dca3 */
/*0028*/ IADD R4, R3, c[0x0][0x8]; /* 0x4800400020311c03 */
/*0030*/ ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */
/*0038*/ ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */
/*0040*/ @P0 ISCADD R3, R3, c[0x0][0x20], 0x2; /* 0x400040008030c043 */
/*0048*/ @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2; /* 0x4000400080412443 */
/*0050*/ @!P0 MOV R5, RZ; /* 0x28000000fc0161e4 */
/*0058*/ @!P1 LD R4, [R4]; /* 0x8000000000412485 */
/*0060*/ @P0 LD R5, [R3]; /* 0x8000000000314085 */
/*0068*/ SHL R3, R2, 0x2; /* 0x6000c0000820dc03 */
/*0070*/ NOP; /* 0x4000000000001de4 */
/*0078*/ @!P1 IADD R5, R4, R5; /* 0x4800000014416403 */
/*0080*/ MOV R4, c[0x0][0x8]; /* 0x2800400020011de4 */
/*0088*/ STS [R3], R5; /* 0xc900000000315c85 */
/*0090*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0098*/ MOV R6, c[0x0][0x8]; /* 0x2800400020019de4 */
/*00a0*/ ISETP.LT.U32.AND P0, PT, R6, 0x42, PT; /* 0x188ec0010861dc03 */
/*00a8*/ @P0 BRA 0x118; /* 0x40000001a00001e7 */
/*00b0*/ NOP; /* 0x4000000000001de4 */
/*00b8*/ NOP; /* 0x4000000000001de4 */
/*00c0*/ MOV R6, R4; /* 0x2800000010019de4 */
/*00c8*/ SHR.U32 R4, R4, 0x1; /* 0x5800c00004411c03 */
/*00d0*/ ISETP.GE.U32.AND P0, PT, R2, R4, PT; /* 0x1b0e00001021dc03 */
/*00d8*/ @!P0 IADD R7, R4, R2; /* 0x480000000841e003 */
/*00e0*/ @!P0 SHL R7, R7, 0x2; /* 0x6000c0000871e003 */
/*00e8*/ @!P0 LDS R7, [R7]; /* 0xc10000000071e085 */
/*00f0*/ @!P0 IADD R5, R7, R5; /* 0x4800000014716003 */
/*00f8*/ @!P0 STS [R3], R5; /* 0xc900000000316085 */
/*0100*/ BAR.RED.POPC RZ, RZ, RZ, PT; /* 0x50ee0000ffffdc04 */
/*0108*/ ISETP.GT.U32.AND P0, PT, R6, 0x83, PT; /* 0x1a0ec0020c61dc03 */
/*0110*/ @P0 BRA 0xc0; /* 0x4003fffea00001e7 */
/*0118*/ ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT; /* 0x1a0ec0007c21dc03 */
/*0120*/ SSY 0x1f0; /* 0x6000000320000007 */
/*0128*/ @P0 NOP.S; /* 0x40000000000001f4 */
/*0130*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0138*/ LDS R4, [R3+0x80]; /* 0xc100000200311c85 */
/*0140*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*0148*/ STS [R3], R6; /* 0xc900000000319c85 */
/*0150*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0158*/ LDS R4, [R3+0x40]; /* 0xc100000100311c85 */
/*0160*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*0168*/ STS [R3], R6; /* 0xc900000000319c85 */
/*0170*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0178*/ LDS R4, [R3+0x20]; /* 0xc100000080311c85 */
/*0180*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*0188*/ STS [R3], R6; /* 0xc900000000319c85 */
/*0190*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*0198*/ LDS R4, [R3+0x10]; /* 0xc100000040311c85 */
/*01a0*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*01a8*/ STS [R3], R6; /* 0xc900000000319c85 */
/*01b0*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*01b8*/ LDS R4, [R3+0x8]; /* 0xc100000020311c85 */
/*01c0*/ IADD R6, R5, R4; /* 0x4800000010519c03 */
/*01c8*/ STS [R3], R6; /* 0xc900000000319c85 */
/*01d0*/ LDS R5, [R3]; /* 0xc100000000315c85 */
/*01d8*/ LDS R4, [R3+0x4]; /* 0xc100000010311c85 */
/*01e0*/ IADD R4, R5, R4; /* 0x4800000010511c03 */
/*01e8*/ STS.S [R3], R4; /* 0xc900000000311c95 */
/*01f0*/ ISETP.NE.AND P0, PT, R2, RZ, PT; /* 0x1a8e0000fc21dc23 */
/*01f8*/ @P0 BRA.U 0x218; /* 0x40000000600081e7 */
/*0200*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */
/*0208*/ @!P0 LDS R2, [RZ]; /* 0xc100000003f0a085 */
/*0210*/ @!P0 ST [R0], R2; /* 0x900000000000a085 */
/*0218*/ EXIT; /* 0x8000000000001de7 */

/*0130*/-/*01e8*/行可以看出,现在每次执行求和时,中间结果都会立即刷新到共享内存以实现完整的线程可见性.

关于cuda - 在 CUDA 扭曲级别减少中删除 __syncthreads(),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/10729185/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com