c++ - CUDA 多线程 : __threadfence not preventing multiple threads from accessing resource-6ren

c++ - CUDA 多线程 : __threadfence not preventing multiple threads from accessing resource

转载作者：行者123 更新时间：2023-11-28 07:20:03

所以，我最近开始了 CUDA 编程。

我尝试编写一个启动多个线程的程序，并把各线程的启动顺序记录到全局内存中的一个数组里。

不过，其中的互斥（独占）控制部分似乎没有正常工作。
我想防止多个线程同时访问数组Log。

现在数组Log是这样的

Log[0]=160
Log[1]=128
Log[2]=256
Log[3]=96
Log[4]=0
Log[5]=0
Log[6]=0
...etc

我想实现互斥控制，防止多个线程同时访问内存数组 Log。

是不是我对“__threadfence()”的用法有误？
我使用的是 CUDA5.5，计算能力是 2.1。
请各位给些建议。

以下为源码。

#include <cuda_runtime.h>
#include <stdio.h>  
#include <cuda.h> 
#include <cstdio>
#include <thrust/device_ptr.h>
#define N 256

// Forward declaration of the kernel defined after main(); it takes a pointer
// to the int log array that main() allocates with cudaMalloc.
__global__ void CudaThreadfenceTest(int *Log_d);

// Host entry point: zero-fills an N-element log on the host, copies it to the
// device, launches CudaThreadfenceTest with one block of 256 threads, copies
// the log back and prints every entry.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main(){
    int j;
    int Log[N];
    int *Log_d = NULL;
    cudaError_t err;

    // Start from an all-zero log; the kernel treats 0 as "slot is free".
    for(j=0;j<N;j++){
        Log[j]=0;
    }

    // Allocate the device-side log.
    err = cudaMalloc((void**)&Log_d, N*sizeof(int));
    if(err != cudaSuccess){
        fprintf(stderr,"cudaMalloc failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // host -> device
    err = cudaMemcpy(Log_d,Log,N*sizeof(int),cudaMemcpyHostToDevice);
    if(err != cudaSuccess){
        fprintf(stderr,"cudaMemcpy (host to device) failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    // Launch configuration: a single block of 256 threads.
    dim3 blocks(1,1,1);
    dim3 threads(256,1,1);

    // Run the kernel. A launch does not return an error code itself, so a bad
    // launch configuration has to be picked up with cudaGetLastError().
    CudaThreadfenceTest<<<blocks,threads>>>(Log_d);
    err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    // Wait for the kernel to finish; errors raised while it was executing are
    // reported by this call.
    err = cudaDeviceSynchronize();
    if(err != cudaSuccess){
        fprintf(stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    // device -> host
    err = cudaMemcpy(Log,Log_d,N*sizeof(int),cudaMemcpyDeviceToHost);
    if(err != cudaSuccess){
        fprintf(stderr,"cudaMemcpy (device to host) failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    for(j=0;j<N;j++){
        printf("Log[ %d ]=%d \n",j,Log[j]);
    }

    // Keep the console window open until a key is pressed.
    getchar();

    err = cudaFree(Log_d);
    if(err != cudaSuccess){
        fprintf(stderr,"cudaFree failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


/*************************
 * kernel
 *************************/
// Kernel from the question: each thread prints its index, then scans Log_d
// from the start and writes (threadIdx.x + 1) into the first entry it reads
// as 0.
//
// NOTE: this version is racy on purpose -- it is the code being asked about.
// The test `Log_d[j]==0` and the store `Log_d[j]=...` are two separate,
// non-atomic memory operations, so several threads can read 0 from the same
// entry and then all write to it; only one of those writes survives.
__global__ void CudaThreadfenceTest(int *Log_d){

    printf("threadIdx.x = %d , \n",threadIdx.x);
    // __threadfence() is a memory fence, not a lock or a barrier: it does not
    // stop other threads from running the loop below at the same time.
    __threadfence();
    //for Log
    for(int j=0;j<N;j++){
        // Unsynchronised check-then-write: another thread may claim entry j
        // between this read and the store on the next line.
        if(Log_d[j]==0){
            Log_d[j]=threadIdx.x + 1;
            break;
        }
    }
}

最佳答案

__threadfence() 本身不能用来保护对某块内存区域的访问。它并不会“隔离线程”（即让线程互斥），它只与内存更新对其他线程的可见顺序有关。相关文档见 here

你需要的是原子操作（atomics，例如你的示例可以用 atomicCAS 来实现）或临界区（critical section）。

下面是使用原子操作重写后的示例:

程序:

$ cat t258.cu
#include <stdio.h>
#include <cstdio>
#define N 256

// Forward declaration of the kernel defined after main(); its argument is
// the device pointer to the int log array.
__global__ void atomicsTest(int *);

// Host entry point: zero-fills an N-element log on the host, copies it to the
// device, launches atomicsTest with one block of 256 threads, copies the log
// back and prints it four entries per line.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main(){
    int j;
    int Log[N];
    int *Log_d = NULL;
    cudaError_t err;

    // Start from an all-zero log; the kernel treats 0 as "slot is free".
    for(j=0;j<N;j++){
        Log[j]=0;
    }

    // Allocate the device-side log.
    err = cudaMalloc((void**)&Log_d, N*sizeof(int));
    if(err != cudaSuccess){
        fprintf(stderr,"cudaMalloc failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    // host -> device
    err = cudaMemcpy(Log_d,Log,N*sizeof(int),cudaMemcpyHostToDevice);
    if(err != cudaSuccess){
        fprintf(stderr,"cudaMemcpy (host to device) failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    // Launch configuration: a single block of 256 threads.
    dim3 blocks(1,1,1);
    dim3 threads(256,1,1);

    // Run the kernel. A launch does not return an error code itself, so a bad
    // launch configuration has to be picked up with cudaGetLastError().
    atomicsTest<<<blocks,threads>>>(Log_d);
    err = cudaGetLastError();
    if(err != cudaSuccess){
        fprintf(stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    // device -> host. This blocking copy runs after the kernel has finished,
    // and its return code also reports errors raised during kernel execution.
    err = cudaMemcpy(Log,Log_d,N*sizeof(int),cudaMemcpyDeviceToHost);
    if(err != cudaSuccess){
        fprintf(stderr,"cudaMemcpy (device to host) failed: %s\n",cudaGetErrorString(err));
        cudaFree(Log_d);
        return 1;
    }

    // Print four entries per output line.
    for(j=0;j<N;j++){
        printf("Log[ %3d ]=%3d ",j,Log[j]);
        if (!((j+1)%4)) printf("\n");
    }

    // Keep the console window open until a key is pressed.
    getchar();

    err = cudaFree(Log_d);
    if(err != cudaSuccess){
        fprintf(stderr,"cudaFree failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


// Each thread walks Log_d from index 0 and tries to claim the first free
// entry (value 0) by atomically swapping in its own tag, threadIdx.x + 1.
// atomicCAS returns the value that was stored before the call, so a return
// value of 0 means this thread's tag was written and the thread is done.
// A thread that finds no free entry among the N slots writes nothing.
__global__ void atomicsTest(int *Log_d){
    const int tag = threadIdx.x + 1;

    int slot = 0;
    while (slot < N) {
        // Compare-and-swap: store tag only if the entry still holds 0.
        const int previous = atomicCAS(&Log_d[slot], 0, tag);
        if (previous == 0) {
            return;
        }
        ++slot;
    }
}

编译:

$ nvcc -arch=sm_20 -o t258 t258.cu

输出:

$ ./t258
Log[   0 ]=  1 Log[   1 ]=161 Log[   2 ]=162 Log[   3 ]=163
Log[   4 ]=164 Log[   5 ]=165 Log[   6 ]=166 Log[   7 ]=167
Log[   8 ]=168 Log[   9 ]=169 Log[  10 ]=  2 Log[  11 ]=  3
Log[  12 ]=  4 Log[  13 ]=  5 Log[  14 ]=170 Log[  15 ]=171
Log[  16 ]=172 Log[  17 ]=  6 Log[  18 ]=173 Log[  19 ]=174
Log[  20 ]=175 Log[  21 ]=176 Log[  22 ]=177 Log[  23 ]=178
Log[  24 ]=179 Log[  25 ]=180 Log[  26 ]=181 Log[  27 ]=182
Log[  28 ]=183 Log[  29 ]=184 Log[  30 ]=185 Log[  31 ]=186
Log[  32 ]=187 Log[  33 ]=188 Log[  34 ]=189 Log[  35 ]=190
Log[  36 ]=191 Log[  37 ]=192 Log[  38 ]=  7 Log[  39 ]=  8
Log[  40 ]=  9 Log[  41 ]= 10 Log[  42 ]= 11 Log[  43 ]= 12
Log[  44 ]= 13 Log[  45 ]= 14 Log[  46 ]= 15 Log[  47 ]= 16
Log[  48 ]= 17 Log[  49 ]= 18 Log[  50 ]= 19 Log[  51 ]= 20
Log[  52 ]= 21 Log[  53 ]= 22 Log[  54 ]= 23 Log[  55 ]= 24
Log[  56 ]= 25 Log[  57 ]= 26 Log[  58 ]= 27 Log[  59 ]= 28
Log[  60 ]= 29 Log[  61 ]= 30 Log[  62 ]= 31 Log[  63 ]= 32
Log[  64 ]= 33 Log[  65 ]= 34 Log[  66 ]= 35 Log[  67 ]= 36
Log[  68 ]= 37 Log[  69 ]= 38 Log[  70 ]= 39 Log[  71 ]= 40
Log[  72 ]= 41 Log[  73 ]= 42 Log[  74 ]= 43 Log[  75 ]= 44
Log[  76 ]= 45 Log[  77 ]= 46 Log[  78 ]= 47 Log[  79 ]= 48
Log[  80 ]= 49 Log[  81 ]= 50 Log[  82 ]= 51 Log[  83 ]= 52
Log[  84 ]= 53 Log[  85 ]= 54 Log[  86 ]= 55 Log[  87 ]= 56
Log[  88 ]= 57 Log[  89 ]= 58 Log[  90 ]= 59 Log[  91 ]= 60
Log[  92 ]= 61 Log[  93 ]= 62 Log[  94 ]= 63 Log[  95 ]= 64
Log[  96 ]=225 Log[  97 ]=226 Log[  98 ]=227 Log[  99 ]=228
Log[ 100 ]=229 Log[ 101 ]=230 Log[ 102 ]=231 Log[ 103 ]=232
Log[ 104 ]=233 Log[ 105 ]=234 Log[ 106 ]=235 Log[ 107 ]=236
Log[ 108 ]=237 Log[ 109 ]=238 Log[ 110 ]=239 Log[ 111 ]=240
Log[ 112 ]=241 Log[ 113 ]=242 Log[ 114 ]=243 Log[ 115 ]=244
Log[ 116 ]=245 Log[ 117 ]=246 Log[ 118 ]=247 Log[ 119 ]=248
Log[ 120 ]=249 Log[ 121 ]=250 Log[ 122 ]=251 Log[ 123 ]=252
Log[ 124 ]=253 Log[ 125 ]=254 Log[ 126 ]=255 Log[ 127 ]=256
Log[ 128 ]= 97 Log[ 129 ]= 98 Log[ 130 ]= 99 Log[ 131 ]=100
Log[ 132 ]=101 Log[ 133 ]=102 Log[ 134 ]=103 Log[ 135 ]=104
Log[ 136 ]=105 Log[ 137 ]=106 Log[ 138 ]=107 Log[ 139 ]=108
Log[ 140 ]=109 Log[ 141 ]=110 Log[ 142 ]=111 Log[ 143 ]=112
Log[ 144 ]=113 Log[ 145 ]=114 Log[ 146 ]=115 Log[ 147 ]=116
Log[ 148 ]=117 Log[ 149 ]=118 Log[ 150 ]=119 Log[ 151 ]=120
Log[ 152 ]=121 Log[ 153 ]=122 Log[ 154 ]=123 Log[ 155 ]=124
Log[ 156 ]=125 Log[ 157 ]=126 Log[ 158 ]=127 Log[ 159 ]=128
Log[ 160 ]=129 Log[ 161 ]=130 Log[ 162 ]=131 Log[ 163 ]=132
Log[ 164 ]=133 Log[ 165 ]=134 Log[ 166 ]=135 Log[ 167 ]=136
Log[ 168 ]=137 Log[ 169 ]=138 Log[ 170 ]=139 Log[ 171 ]=140
Log[ 172 ]=141 Log[ 173 ]=142 Log[ 174 ]=143 Log[ 175 ]=144
Log[ 176 ]=145 Log[ 177 ]=146 Log[ 178 ]=147 Log[ 179 ]=148
Log[ 180 ]=149 Log[ 181 ]=150 Log[ 182 ]=151 Log[ 183 ]=152
Log[ 184 ]=153 Log[ 185 ]=154 Log[ 186 ]=155 Log[ 187 ]=156
Log[ 188 ]=157 Log[ 189 ]=158 Log[ 190 ]=159 Log[ 191 ]=160
Log[ 192 ]= 65 Log[ 193 ]=193 Log[ 194 ]=194 Log[ 195 ]=195
Log[ 196 ]=196 Log[ 197 ]=197 Log[ 198 ]=198 Log[ 199 ]=199
Log[ 200 ]=200 Log[ 201 ]=201 Log[ 202 ]=202 Log[ 203 ]=203
Log[ 204 ]=204 Log[ 205 ]=205 Log[ 206 ]=206 Log[ 207 ]=207
Log[ 208 ]=208 Log[ 209 ]=209 Log[ 210 ]=210 Log[ 211 ]=211
Log[ 212 ]=212 Log[ 213 ]=213 Log[ 214 ]=214 Log[ 215 ]=215
Log[ 216 ]=216 Log[ 217 ]=217 Log[ 218 ]=218 Log[ 219 ]=219
Log[ 220 ]=220 Log[ 221 ]=221 Log[ 222 ]=222 Log[ 223 ]=223
Log[ 224 ]=224 Log[ 225 ]= 66 Log[ 226 ]= 67 Log[ 227 ]= 68
Log[ 228 ]= 69 Log[ 229 ]= 70 Log[ 230 ]= 71 Log[ 231 ]= 72
Log[ 232 ]= 73 Log[ 233 ]= 74 Log[ 234 ]= 75 Log[ 235 ]= 76
Log[ 236 ]= 77 Log[ 237 ]= 78 Log[ 238 ]= 79 Log[ 239 ]= 80
Log[ 240 ]= 81 Log[ 241 ]= 82 Log[ 242 ]= 83 Log[ 243 ]= 84
Log[ 244 ]= 85 Log[ 245 ]= 86 Log[ 246 ]= 87 Log[ 247 ]= 88
Log[ 248 ]= 89 Log[ 249 ]= 90 Log[ 250 ]= 91 Log[ 251 ]= 92
Log[ 252 ]= 93 Log[ 253 ]= 94 Log[ 254 ]= 95 Log[ 255 ]= 96

$

关于c++ - CUDA 多线程 : __threadfence not preventing multiple threads from accessing resource，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/19642047/

文章推荐： c++ - 在 visual studio 中使用 calcOpticalFlowPyrLK() 函数

文章推荐： ios - 将 subview 定位在圆形 View 的边缘

文章推荐： javascript - 附加 HTML 和 PHP 代码

C#基础多线程问题: Call Method on Thread A from Thread B (Thread B started from Thread A)
完成此任务的最佳方法是什么:主线程(线程 A)创建另外两个线程(线程 B 和线程 C)。线程 B 和 C 执行繁重的磁盘 I/O，最终需要将它们创建的资源传递给线程 A，然后调用外部 DLL 文件中的
multithreading - Threads.@spawn 和 Threads.@threads 有什么区别？
我是一名对 Julia 语言感兴趣的新手程序员。文档( https://docs.julialang.org/en/v1/base/multi-threading/ )说 Threads.@threa
python - thread.start_new_thread 与 threading.Thread.start
python中的thread.start_new_thread和threading.Thread.start有什么区别？我注意到，当调用 start_new_thread 时，新线程会在调用线程终止
安卓蓝牙 : A thread started from UI thread blocks the UI thread
我正在学习安卓蓝牙编程。我从 Google 的 Android 开发者网站上复制了大部分代码以供学习。这个想法是监听服务器上的连接是在一个新线程中完成的，而不会阻塞 UI 线程。当收到连接请求时，连接
Java多线程: Does the thread on which an objects method is executed depend on the thread on the thread in which it is created?
执行对象方法的线程是否依赖于创建它的线程上的线程？假设您的 java 应用程序中有两个线程 Thread1 和 Thread2，以及两个类 ClassA 和 ClassB。您在 Thread1 上
C++11 std::thread 给出错误:没有匹配的函数来调用 std::thread::thread
我正在用这段代码测试 C++11 线程，但是在创建线程时，我遇到了错误没有匹配函数调用 'std::thread::thread()'. 这就像我给 std::thread ctr 的函数有什么问题，
c++ - 使用已删除的函数 'std::thread::thread(const std::thread&)'
我有如下类 eventEngine 和网关: class eventEngine { public: eventEngine(); std::thread threa; std
python - "RuntimeError: thread.__init__() not called"子类化 threading.Thread 时
我需要运行与列表 dirlist 中的元素一样多的 Observer 类线程。当我运行它 python 控制台时，它可以正常工作。 class Observer(Thread): def ru
java - 在对 Thread.currentThread(); 的方法调用中，Thread 指的是什么？和 Thread.sleep();？
我在一本 Java 书中读到了下面的代码。我知道主类默认继承 Thread 类，所以 currentThread();而不是 Thread.currentThread();也会做这项工作。但我不明白
java - 守护线程 : Is it possible to change a running thread from user thread to daemon thread?
我在我的系统中使用第 3 方 API，该 API 启动一个永久运行的用户线程。一旦我的程序结束，JVM 由于该线程而继续运行，因此我尝试获取此线程引用并通过更改它 thread.setDaemon(t
python - 为什么 super(Thread, self).__init__() 不能用于 threading.Thread 子类？
我所知道的 Python 中的每个对象都可以通过调用来处理其基类初始化: super(BaseClass, self).__init__() threading.Thread 的子类似乎不是这种情况，
c# - Xamarin - Java.Lang.Thread 与 System.Threading.Thread - 使用哪一个？
在我最近从事的 Xamarin 项目中，我可以看到开发人员使用了 Java.Lang.Thread 以及 System.Threading.Thread(用于非常相似的操作 - 例如在后台加载数据)。
Julia Threads.@threads 在一个简单的例子中不起作用
我在 Julia 中运行双循环。代码非常简单。 w = rand(1000,1000) function regular_demo(w::Array{Float64, 2}) n = size
multithreading - 将参数传递给 threading.Thread
我在 Windows 上使用 Python 3。我正在使用 threading.Thread动态运行一个函数，我可以带参数或不带参数调用它。我正在设置一个列表，其中的第一项是定义路径的字符串。其他参数
python - threading.Thread 中的流控制
我遇到了一些使用线程模块(使用 Python 2.6)管理线程的示例。我想了解的是这个例子是如何调用“运行”方法的，在哪里调用的。我在任何地方都看不到它。 ThreadUrl 类在 main() 函
Python threading.Thread、范围和垃圾收集
假设我从 threading.Thread 派生: from threading import Thread class Worker(Thread): def start(self):
python - 'threading' 对象没有属性 'Thread'
关闭。这个问题需要debugging details .它目前不接受答案。编辑问题以包含 desired behavior, a specific problem or error, and th
.net - WinDbg/SOS : How to correlate managed threads from ! 带有 System.Threading.Thread 实例的线程命令
使用 WinDbg 和 SOS，我有以下内容: 0:011> !threads ThreadCount: 7 UnstartedThread: 0 BackgroundThread: 4 Pendin
java - 谷歌应用引擎错误 : Fetch in a thread that is neither the original request thread nor a thread created by ThreadManager
App Engine 给出错误: com.google.apphosting.api.ApiProxy$CallNotFoundException: Can't make API call urlfe
java - "Thread-19"java.lang.IllegalStateException : Not on FX application thread; currentThread = Thread-19
我正在尝试将 Swing JEditorPane 嵌入到 JavaFX 项目中，如下代码所示。 Platform.runLater(() -> { SyntaxTester ob = new

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c++ - CUDA 多线程 : __threadfence not preventing multiple threads from accessing resource