gpt4 book ai didi

CUDA:对无符号字符的原子操作

转载 作者:行者123 更新时间:2023-12-02 02:27:14 28 4
gpt4 key购买 nike

我是 CUDA 初学者。我在全局内存中有一个由无符号字符(unsigned char)组成的像素缓冲区,任何线程都可能对它进行更新。因此,为了避免像素值出现异常,我想在线程更新某个像素时使用 atomicExch。但是编程指南说这个函数只适用于 32 位或 64 位的字,而我只想以原子方式交换一个 8 位字节。有没有办法做到这一点?

谢谢。

最佳答案

我最近刚遇到这个问题。理论上,原子操作/乐观重试应该比锁/互斥锁更快,因此对其他数据类型使用原子操作的“hack”解决方案对我来说似乎比使用临界区更好。

以下是一些实现,它们基于“how to implement atomicMin for char”和“atomicAdd for short”这两个讨论帖。

我已经测试了所有这些,我的测试似乎表明它们到目前为止工作正常。

用于字符的 atomicAdd 版本 1

/// Atomically adds `val` to the single byte at `address` and returns the byte's previous value.
///
/// Built on the 32-bit atomicCAS: the aligned 32-bit word that contains the byte is re-read and
/// re-written in a compare-and-swap retry loop, with only the target byte changed.
///
/// @param address  pointer to the byte to update
/// @param val      amount to add (the sum wraps within the byte)
/// @return the value the byte held in the word on which the compare-and-swap succeeded
__device__ static inline char atomicAdd(char* address, char val) {
    // Position (0..3) of the target byte inside the 4-byte-aligned word that contains it.
    const size_t byte_index = (size_t) address & 3;
    // Address of that aligned 32-bit word.
    unsigned int* const word_ptr = (unsigned int*) ((char*) address - byte_index);
    // __byte_perm selectors: 0x3210 would copy the first argument unchanged; the nibble "4" marks
    // the output position that instead receives byte 0 of the second argument.
    const unsigned int splice_selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int splice_selector = splice_selectors[byte_index];

    unsigned int observed = *word_ptr;
    for (;;) {
        const unsigned int snapshot = observed;
        // Move the target byte into the low byte and add val; only the low byte of the sum is used.
        const unsigned int summed = __byte_perm(snapshot, 0, byte_index) + val;
        // Rebuild the word with the target byte replaced by the low byte of the sum.
        const unsigned int candidate = __byte_perm(snapshot, summed, splice_selector);
        observed = atomicCAS(word_ptr, snapshot, candidate);
        if (observed == snapshot) {
            // The word was unchanged since the snapshot, so the update was applied.
            break;
        }
    }
    // Low byte of the permuted word is the previous value of the target byte.
    return __byte_perm(observed, 0, byte_index);
}

用于字符的 atomicCAS
/// Atomic compare-and-swap on a single byte, built on the 32-bit atomicCAS.
///
/// If the byte at `address` equals `expected`, it is replaced by `desired`; otherwise memory is
/// left untouched. The other three bytes of the enclosing aligned 32-bit word are never modified.
///
/// @param address   pointer to the byte to update
/// @param expected  value the byte must currently hold for the swap to happen
/// @param desired   value to store when the comparison succeeds
/// @return the value of the byte observed by the operation; equals `expected` exactly when the
///         swap was performed
__device__ static inline char atomicCAS(char* address, char expected, char desired) {
    // Position (0..3) of the target byte inside the 4-byte-aligned word that contains it.
    const size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    // __byte_perm selectors: the nibble "4" marks the output byte taken from byte 0 of the second
    // argument; every other byte is copied from the first argument unchanged.
    const unsigned int selectors[] = {0x3214, 0x3240, 0x3410, 0x4210};
    const unsigned int sel = selectors[long_address_modulo];
    const unsigned int shift = 8 * (unsigned int) long_address_modulo;

    // Only byte 0 of long_val is picked by the selector, so sign extension is harmless.
    const unsigned int long_val = (unsigned int) desired;
    unsigned int long_old = *base_address;
    unsigned int long_assumed;
    char old;

    do {
        long_assumed = long_old;
        old = (char) ((long_assumed >> shift) & 0x000000ff);
        if (old != expected) {
            // Comparison failed: report the current value and do NOT write anything.
            return old;
        }
        const unsigned int replacement = __byte_perm(long_assumed, long_val, sel);
        long_old = atomicCAS(base_address, long_assumed, replacement);
        // A mismatch here means some byte of the word changed since the snapshot
        // (possibly a neighbour); re-check the target byte against `expected` and retry.
    } while (long_old != long_assumed);

    return old;
}

用于 char 的 atomicAdd 版本 2(使用位移而不是 __byte_perm 并且因此必须处理溢出)
/// Atomically adds `val` to the single byte at `address` using shifts and masks (no __byte_perm)
/// and returns the byte's previous value.
///
/// The update is performed as a compare-and-swap retry loop on the aligned 32-bit word that
/// contains the byte. Carry out of the byte is discarded by masking inside the loop, so the
/// neighbouring bytes never hold an intermediate carry or borrow. A word-wide atomicAdd followed
/// by a separate corrective atomicSub is not used: it leaves a window in which other threads
/// observe, and can build on, corrupted neighbouring bytes.
///
/// @param address  pointer to the byte to update
/// @param val      amount to add (the sum wraps within the byte)
/// @return the value the byte held in the word on which the compare-and-swap succeeded
__device__ static inline char atomicAdd2(char* address, char val) {
    // Position (0..3) of the target byte inside the 4-byte-aligned word that contains it.
    const size_t long_address_modulo = (size_t) address & 3;
    auto* base_address = (unsigned int*) ((char*) address - long_address_modulo);
    const unsigned int shift = 8 * (unsigned int) long_address_modulo;
    // Bits of the word that belong to the target byte.
    const unsigned int mask = 0x000000ffu << shift;
    // Keep only the 8 value bits so a negative val does not sign-extend into other bytes.
    const unsigned int addend = (unsigned int) (unsigned char) val;

    unsigned int long_old = *base_address;
    unsigned int long_assumed;
    do {
        long_assumed = long_old;
        // New byte value, wrapped to 8 bits.
        const unsigned int sum = (((long_assumed & mask) >> shift) + addend) & 0x000000ffu;
        // Same word with only the target byte replaced.
        const unsigned int replacement = (long_assumed & ~mask) | (sum << shift);
        long_old = atomicCAS(base_address, long_assumed, replacement);
    } while (long_old != long_assumed);

    return (char) ((long_old & mask) >> shift);
}

对于 atomicMin,请查看 this thread .

关于CUDA:对无符号字符的原子操作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/5447570/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com