
linux - Strange behavior of multiple host threads using cuFFT


The test program below creates one host thread per GPU. Each host thread creates a cuFFT plan and executes an FFT.

Most of the time the program appears to run without errors. Sometimes, however, it fails in a variety of ways (see the sample outputs below). Does anyone know why this program occasionally fails?

I am running this on a machine with two Intel Xeon E5620 CPUs, 48 GB of host memory, and four Tesla C2075 cards, none of which is used for display. The operating system is Linux (Debian 7.4) and I have CUDA 5.5 installed. The NVIDIA driver version is 319.82. For reference, cuFFT error codes 4 and 11 are CUFFT_INVALID_VALUE and CUFFT_INVALID_DEVICE, respectively.

Update: I am becoming increasingly suspicious that cuFFT is not thread-safe, even though the documentation seems to claim it is. If I protect the entire per-thread body with a mutex (i.e., only one thread actually does any work at a time), the program does not fail. Protecting only the cufftPlan1d call, or only the cufftExecR2C call, with a mutex still lets the program fail. Am I misunderstanding the documentation?

Thread-safe API that can be called from multiple independent host threads
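For illustration, the whole-body locking workaround described in the update could look roughly like the sketch below. This is only a sketch under my own assumptions: the mutex name, the trimmed-down thread function, and the elided buffer handling are not part of the original program, and error checking is omitted.

// Sketch of the workaround: serialize the entire cuFFT section of each
// thread with one global mutex (name and structure are assumptions).
static pthread_mutex_t cufft_lock = PTHREAD_MUTEX_INITIALIZER;

void *thread_func_serialized(void *arg)
{
    config_t *config = (config_t *)arg;
    cudaSetDevice(config->device);

    // Locking only cufftPlan1d or only cufftExecR2C was not enough;
    // the whole section has to be held under the lock.
    pthread_mutex_lock(&cufft_lock);

    cufftHandle plan;
    cufftPlan1d(&plan, config->np, CUFFT_R2C, 1);
    // ... cudaMalloc buffers, cufftExecR2C, cudaDeviceSynchronize, cudaFree ...
    cufftDestroy(plan);

    pthread_mutex_unlock(&cufft_lock);
    return NULL;
}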

Example 1

4 CUDA device(s) found
Device 3 initialized
Device 2 initialized
Device 1 initialized
FFT execution failed for device 1, status = 11
Device 0 initialized
Device 3 deinitialized
Device 2 deinitialized
Device 0 deinitialized

Note that the thread for device 1 did not terminate.

Example 2

4 CUDA device(s) found
Device 0 initialized
Device 2 initialized
Device 1 initialized
Device 3 initialized
FFT execution failed for device 3, status = 11
Device 2 deinitialized
Device 0 deinitialized
Device 1 deinitialized

Example 3

4 CUDA device(s) found
Device 1 initialized
Device 2 initialized
FFT execution failed for device 2, status = 4
Device 1 deinitialized
Device 3 initialized
Device 0 initialized
FFT execution failed for device 0, status = 4
Device 3 deinitialized

Example 4

4 CUDA device(s) found
Segmentation fault

Example 5

4 CUDA device(s) found
Device 3 initialized
Device 2 initialized
Device 3 deinitialized
Plan creation failed for device 0, status = 4
^C

In this last example the program did not terminate.

If I run several copies of this program at the same time, using for i in {0..9}; do ./pthread_cuda & done, it fails in new and interesting ways:

Example 6

4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found
pthread_cuda: pthread_mutex_lock.c:84: __pthread_mutex_lock: Assertion `mutex->__data.__owner == 0' failed.
4 CUDA device(s) found
4 CUDA device(s) found
4 CUDA device(s) found

I am not using any mutexes in my program, so is this a bug in the cuFFT library?

The pthread_cuda code

#include <cuda_runtime_api.h>
#include <cufft.h>
#include <malloc.h>
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// The thread configuration structure.
typedef struct
{
    int device;
    pthread_t thread;
    cudaError_t status;
    int np;
} config_t;

// The size of memory region.
int np = 16384;

// The function executed by each thread assigned to a CUDA device.
void *thread_func(void *arg)
{
    // Unpack the config structure.
    config_t *config = (config_t *)arg;

    int device = config->device;

    int fft_in_np = config->np;
    int fft_out_np = config->np / 2 + 1;

    // Set focus on the device with the specified index.
    cudaError_t cuda_status = cudaSetDevice(device);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot set focus to device %d, status = %d\n",
            device, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }

    cufftHandle r2c_plan;
    cufftResult cufft_status = cufftPlan1d(&r2c_plan, fft_in_np, CUFFT_R2C, 1);
    if (cufft_status != CUFFT_SUCCESS) {
        fprintf(stderr, "Plan creation failed for device %d, status = %d\n",
            device, cufft_status);
        //config->status = cufft_status;
        pthread_exit(NULL);
    }

    // Create device arrays for input and output data.
    cufftReal *in_dev_data = NULL;
    cufftComplex *out_dev_data = NULL;

    cuda_status = cudaMalloc((void **)&in_dev_data, (fft_in_np + 2) * sizeof(cufftReal));
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot allocate CUDA FFT buffer on device %d, status = %d\n",
            device, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }

    cuda_status = cudaMalloc((void **)&out_dev_data, fft_out_np * sizeof(cufftComplex));
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot allocate CUDA FFT buffer on device %d, status = %d\n",
            device, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }

    printf("Device %d initialized\n", device);

    //out_dev_data = (cufftComplex *)in_dev_data;

    cufft_status = cufftExecR2C(r2c_plan, in_dev_data, out_dev_data);
    if (cufft_status != CUFFT_SUCCESS) {
        fprintf(stderr, "FFT execution failed for device %d, status = %d\n",
            device, cufft_status);
        //config->status = cuda_status;
        pthread_exit(NULL);
    }

    cuda_status = cudaDeviceSynchronize();
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Failed to synchronize device %d, status = %d\n",
            device, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }

    // Dispose device buffers.
    cuda_status = cudaFree(in_dev_data);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot release input buffer on device %d, status = %d\n",
            device, cuda_status);
        config->status = cuda_status;
        pthread_exit(NULL);
    }

    cufft_status = cufftDestroy(r2c_plan);
    if (cufft_status != CUFFT_SUCCESS) {
        fprintf(stderr, "Plan destruction failed for device %d, status = %d\n",
            device, cufft_status);
        //config->status = cuda_status;
        pthread_exit(NULL);
    }

    printf("Device %d deinitialized\n", device);

    config->status = 0;
    return NULL;
}

int main(int argc, char* argv[])
{
    int ndevices = 0;
    cudaError_t cuda_status = cudaGetDeviceCount(&ndevices);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot get the cuda device count, status = %d\n",
            cuda_status);
        return cuda_status;
    }

    // Return if no cuda devices are present.
    printf("%d CUDA device(s) found\n", ndevices);
    if (!ndevices)
        return 0;

    int dev_num;
    cuda_status = cudaGetDevice(&dev_num);
    if (cuda_status != cudaSuccess) {
        fprintf(stderr, "Cannot get the cuda device number, status = %d\n",
            cuda_status);
        return cuda_status;
    }

    // Create worker configs. Their data will be passed as
    // the argument to thread_func.
    config_t* configs = (config_t*)malloc(sizeof(config_t) * ndevices);

    // For each CUDA device found create a separate thread
    // and execute the thread_func.
    for (int i = 0; i < ndevices; i++) {
        config_t *config = configs + i;
        config->device = i;
        config->np = np;
        //config->in_host = in + np * i;

        int status = pthread_create(&config->thread, NULL, thread_func, config);
        if (status) {
            fprintf(stderr, "Cannot create thread for device %d, status = %d\n",
                i, status);
            return status;
        }
    }

    // Wait for device threads to complete.
    // Check error status.
    int status = 0;
    for (int i = 0; i < ndevices; i++) {
        pthread_join(configs[i].thread, NULL);
        status += configs[i].status;
    }
    if (status)
        return status;

    free(configs);

    return 0;
}

Best Answer

For the record, CUDA 6.0 RC appears to fix this problem. I suspect it was caused by a bug in the threading code inside the cuFFT library, but I cannot be sure.
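To double-check which toolkit a given build is actually running against (for example after upgrading to the 6.0 RC), the driver, runtime, and cuFFT versions can be queried at run time. A minimal stand-alone sketch, not part of the original program:

#include <cuda_runtime_api.h>
#include <cufft.h>
#include <stdio.h>

int main(void)
{
    int driver = 0, runtime = 0, cufft = 0;

    // Driver and runtime versions are encoded as 1000 * major + 10 * minor,
    // e.g. 5050 for CUDA 5.5 and 6000 for CUDA 6.0; cufftGetVersion reports
    // the version of the cuFFT library the program is linked against.
    cudaDriverGetVersion(&driver);
    cudaRuntimeGetVersion(&runtime);
    cufftGetVersion(&cufft);

    printf("driver %d, runtime %d, cuFFT %d\n", driver, runtime, cufft);
    return 0;
}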

Regarding "linux - Strange behavior of multiple host threads using cuFFT", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/22501702/
