cuda - __match_any_sync 在计算能力 6 上的替代方案是什么？-6ren

cuda - __match_any_sync 在计算能力 6 上的替代方案是什么？

转载作者：行者123 更新时间：2023-12-03 22:00:22

在 CUDA 示例中（例如这里），使用了 __match_all_sync 和 __match_any_sync。

这是一个例子，其中一个 warp 被分成多个（一个或多个）组，每个组都维护自己的原子计数器。

// Warp-aggregated atomic increment: increments the value at ptr by 1 on behalf
// of the calling thread and returns the value it held before this thread's
// increment. Currently active lanes that pass the same ptr form a group; one
// leader per group issues a single atomicAdd for the whole group.
// Requires compute capability 7.0+ (__match_any_sync).
// NOTE(review): ThreadId() is defined outside this snippet; presumably it
// returns the thread's linear index within the block - confirm.
__device__ int atomicAggInc(int* ptr) {
    // __match_any_sync takes (mask, value) only; the extra predicate output
    // parameter belongs to __match_all_sync. The pointer is passed as a 64-bit
    // integer because the intrinsic is overloaded for arithmetic types only.
    const auto mask = __match_any_sync(__activemask(), (unsigned long long)ptr);
    const auto leader = __ffs(mask) - 1;     // lowest lane of the group is the leader
    int res = 0;
    const auto lane_id = ThreadId() % warpSize;
    if (lane_id == leader) {                 // leader does the update for the whole group
        res = atomicAdd(ptr, __popc(mask));
    }
    res = __shfl_sync(mask, res, leader);    // get leader's old value
    // Offset by the number of group members in lower lanes. The shift is done
    // in unsigned arithmetic so lane 31 does not overflow a signed int.
    return res + __popc(mask & ((1u << lane_id) - 1u));
}

这里 __match_any_sync 将 warp 中的线程按相同的 ptr 值分组，以便每个组可以原子地更新自己的 ptr，而不会妨碍其他线程。

我知道 nvcc 编译器（从 CUDA 9 开始）会在底层自动进行这种优化，但这里只讨论 __match_any_sync 的机制。

有没有办法在计算能力 7 之前的设备上做到这一点？

最佳答案

编辑：博客文章现已修改为使用 __match_any_sync() 而不是 __match_all_sync()，因此应忽略下面与此相关的任何评论。下面的答案也已相应修改以反映这一点。

根据你的说法:

this is just about the mechanics of __match_any_sync

我们将专注于替代 __match_any_sync 本身，而不是以任何其他形式重写 atomicAggInc 函数。因此，我们必须提供一个掩码，其值与 __match_any_sync() 在 cc7.0 或更高架构上返回的值相同。

我相信这将需要一个循环来广播 ptr 值，在最坏的情况下，对 warp 中的每个线程进行一次迭代（因为每个线程可能具有唯一的 ptr 值），并测试哪些线程具有相同的值。我们可以通过多种方式“优化”此循环，以便根据每个线程中实际的 ptr 值将循环次数从 32 减少到某个较小的值，但在我看来，这种优化会引入相当大的复杂性，并使最坏情况的处理时间更长（这是提前退出优化的典型特征）。所以我将演示一个没有这种优化的相当简单的方法。

另一个需要考虑的问题是：在 warp 没有收敛的情况下该怎么办？为此，我们可以使用 __activemask() 来识别这种情况。

这是一个工作示例:

$ cat t1646.cu
#include <iostream>
#include <stdio.h>

// Warp-aggregated atomic increment: increments the value at ptr by 1 on behalf
// of the calling thread and returns the value it held before this thread's
// increment. Currently active lanes that pass the same ptr form a group; one
// leader per group issues a single atomicAdd for the whole group.
//
// On compute capability 7.0+ the group mask comes from __match_any_sync. On
// older architectures it is emulated with one shuffle + ballot per lane.
// Define USE_OPT to skip loop iterations for lanes that are not active.
// The lane id is derived from threadIdx.x only.
__device__ int atomicAggInc(int* ptr) {
    const unsigned long long key = (unsigned long long)ptr;
    const unsigned lane_id = threadIdx.x % warpSize;
    unsigned mask = 0;   // bit i set <=> lane i is active and passed the same ptr
#if __CUDA_ARCH__ >= 700
    mask = __match_any_sync(__activemask(), key);
#else
    const unsigned tmask = __activemask();
    for (int i = 0; i < warpSize; i++) {
#ifdef USE_OPT
        // tmask is the same value in every participating lane, so this skip
        // is taken uniformly and the *_sync calls below stay matched.
        if (!((1U << i) & tmask)) continue;
#endif
        // Broadcast lane i's pointer, then collect the lanes whose pointer equals it.
        const unsigned long long tptr = __shfl_sync(tmask, key, i);
        const unsigned peers = __ballot_sync(tmask, tptr == key);
        // Only lane i keeps this ballot: it is the set of lanes matching lane i.
        if ((unsigned)i == lane_id) mask = peers;
    }
#endif
    const int leader = __ffs(mask) - 1;      // lowest lane of the group is the leader
    int res = 0;
    if (lane_id == (unsigned)leader) {       // leader does the update for the whole group
        res = atomicAdd(ptr, __popc(mask));
    }
    res = __shfl_sync(mask, res, leader);    // get leader's old value
    // Offset by the number of group members in lower lanes. The shift is done
    // in unsigned arithmetic so lane 31 does not overflow a signed int.
    return res + __popc(mask & ((1U << lane_id) - 1U));
}



// Test kernel: every thread derives a target element d[threadIdx.x / 4], but
// only threads 16..31 perform the aggregated increment, so elements d[4]..d[7]
// each receive four increments and the remaining threads stay idle.
__global__ void k(int *d){
  const unsigned tid = threadIdx.x;
  int *target = d + tid / 4;
  const bool participates = (tid >= 16) && (tid < 32);
  if (participates) {
    atomicAggInc(target);
  }
}

// Number of ints in the device buffer; also used as the block size of the launch.
const int ds = 32;

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what){
  if (err == cudaSuccess) return true;
  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
  return false;
}

// Zeroes a device buffer of ds ints, runs kernel k on one block of ds threads,
// copies the buffer back and prints it space-separated on a single line.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){

  int h_d[ds];      // host copy of the result; fixed size, so no heap allocation to leak
  int *d_d = NULL;
  if (!cudaOk(cudaMalloc(&d_d, ds*sizeof(d_d[0])), "cudaMalloc")) return 1;

  bool ok = cudaOk(cudaMemset(d_d, 0, ds*sizeof(d_d[0])), "cudaMemset");
  if (ok) {
    k<<<1,ds>>>(d_d);
    // A kernel launch returns nothing; launch errors must be queried explicitly.
    ok = cudaOk(cudaGetLastError(), "kernel launch");
  }
  // The blocking copy also reports errors raised while the kernel was executing.
  ok = ok && cudaOk(cudaMemcpy(h_d, d_d, ds*sizeof(d_d[0]), cudaMemcpyDeviceToHost), "cudaMemcpy");

  // Release the device buffer on both the success and the failure path.
  cudaFree(d_d);
  if (!ok) return 1;

  for (int i = 0; i < ds; i++)
    std::cout << h_d[i] << " ";
  std::cout << std::endl;
  return 0;
}
$ nvcc -o t1646 t1646.cu -DUSE_OPT
$ cuda-memcheck ./t1646
========= CUDA-MEMCHECK
0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
========= ERROR SUMMARY: 0 errors
$