gpt4 book ai didi

在 CUDA 中对结构数组进行排序

转载 作者:行者123 更新时间:2023-12-02 21:26:56 25 4
gpt4 key购买 nike

我有一台配备 NVIDIA GT750M 4GB(计算能力 3.0)显卡的笔记本电脑。我需要在 CUDA 上对一个结构体数组(大约 3 × 10^7 个元素)进行排序。但我不知道该怎么做,因为我在 CUDA 方面没有足够的经验。使用 thrust::sort 时,我得到了奇怪的结果(需要几十分钟,而 std::sort 只需要 1 分钟)。

// Record type to be sorted; `key` is the field the comparator orders by.
struct MyStruct
{
    float key;   // sort key
    float a;
    float b;
    int   c;
    int   d;
    int   e;
    int   f;
    bool  flag;
};  // the terminating ';' is required, otherwise the next declaration fails to parse
// Ascending-order comparator for MyStruct: returns true when lhs.key is
// strictly smaller than rhs.key. All other fields are ignored.
//
// Marked __host__ __device__ so that it can be called from device code as
// well as from the host; a host-only function cannot be invoked by a sort
// that executes on the GPU.
// NOTE(review): when handing this to a Thrust algorithm that runs on the
// device, wrap the call in a functor rather than passing the function's
// address taken on the host -- confirm against the call site.
__host__ __device__ bool minCompare(const MyStruct lhs, const MyStruct rhs)
{
    return lhs.key < rhs.key;
}

最佳答案

正如罗伯特·克罗维拉(Robert Crovella)在评论中指出的那样,耗时几十分钟很可能意味着你做错了什么。我在下面提供了一个示例,比较了使用 thrust::sort 对结构数组 (AoS) 排序与使用 thrust::sort_by_key 对数组结构 (SoA) 排序的性能。我在一台配备 GeForce GT 540M 的笔记本电脑上运行,并使用 CUDA 5.5 编译,因此您的显卡比我的更强大。对于 100000 个元素,两种情况下的执行时间均为秒级。正如我在评论中指出的,第一种情况所需的计算时间 (1675ms) 比第二种情况 (668.9ms) 更长。

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// Array-of-structures element: one integer key plus two integer payloads
// that travel with it when the record is moved.
struct MyStruct1
{
    int key;     // field the records are ordered by
    int value1;  // payload
    int value2;  // payload
};

// Structure-of-arrays storage: three device buffers of N ints each
// (key, value1, value2), allocated with cudaMalloc in the constructor and
// released with cudaFree in the destructor.
struct MyStruct2
{
    int  N;       // number of elements in each buffer
    int* key;     // device pointer, N ints
    int* value1;  // device pointer, N ints
    int* value2;  // device pointer, N ints

    // Allocates the three device buffers.
    // Throws std::invalid_argument if N_ is negative and std::runtime_error
    // (carrying the CUDA error string) if any allocation fails; in the latter
    // case buffers that were already allocated are released first.
    MyStruct2(int N_) : N(N_), key(NULL), value1(NULL), value2(NULL) {
        if (N < 0) {
            // A negative count would wrap to a huge size_t in the byte computation.
            throw std::invalid_argument("MyStruct2: element count must be non-negative");
        }
        const size_t bytes = static_cast<size_t>(N) * sizeof(int);

        cudaError_t err = cudaMalloc((void**)&key, bytes);
        if (err == cudaSuccess) err = cudaMalloc((void**)&value1, bytes);
        if (err == cudaSuccess) err = cudaMalloc((void**)&value2, bytes);

        if (err != cudaSuccess) {
            // The destructor does not run for a partially constructed object.
            release();
            throw std::runtime_error(cudaGetErrorString(err));
        }
    }

    ~MyStruct2() { release(); }

private:
    // Non-copyable: a member-wise copy would alias the raw device pointers and
    // free them twice. Declared but intentionally left undefined.
    MyStruct2(const MyStruct2&);
    MyStruct2& operator=(const MyStruct2&);

    // Frees all three buffers and resets the pointers. cudaFree on a null
    // pointer performs no operation, so this is safe to call repeatedly.
    void release() {
        cudaFree(key);
        cudaFree(value1);
        cudaFree(value2);
        key = NULL;
        value1 = NULL;
        value2 = NULL;
    }
};

// Orders MyStruct1 records by their `key` field alone; value1 and value2 do
// not take part in the comparison. Callable from both host and device code.
__host__ __device__ bool operator<(const MyStruct1 &lhs, const MyStruct1 &rhs)
{
    const bool lhsComesFirst = lhs.key < rhs.key;
    return lhsComesFirst;
}

// Terminates the program with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Returns a pseudo-random value in [0, 255]. The remainder is used instead of
// multiplying rand() by 255, which overflows int wherever RAND_MAX is INT_MAX.
static int randomByteValue()
{
    return rand() % 256;
}

// Times two ways of sorting N records by an integer key on the GPU:
//   1. thrust::sort on an array of structures (MyStruct1);
//   2. thrust::sort_by_key plus thrust::gather on a structure of arrays (MyStruct2).
// Each elapsed time is measured with CUDA events and printed in milliseconds.
int main(void)
{
    const int N = 10000;

    float elapsedMs = 0.0f;

    // One pair of events is created here and reused for both measurements.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    /*******************************/
    /* SORTING ARRAY OF STRUCTURES */
    /*******************************/
    thrust::host_vector<MyStruct1> h_struct1(N);
    for (int i = 0; i < N; i++)
    {
        MyStruct1 s;
        s.key    = randomByteValue();
        s.value1 = randomByteValue();
        s.value2 = randomByteValue();
        h_struct1[i] = s;
    }
    thrust::device_vector<MyStruct1> d_struct(h_struct1);

    CUDA_CHECK(cudaEventRecord(start, 0));

    thrust::sort(d_struct.begin(), d_struct.end());

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("Sorting array of structs - elapsed time: %3.1f ms \n", elapsedMs);

    h_struct1 = d_struct;

    // Optional dump of the sorted records:
    //for (int i = 0; i < N; i++)
    //{
    //    MyStruct1 s = h_struct1[i];
    //    printf("key %i value1 %i value2 %i\n", s.key, s.value1, s.value2);
    //}
    //printf("\n\n");

    /*******************************/
    /* SORTING STRUCTURES OF ARRAYS*/
    /*******************************/
    MyStruct2 d_struct2(N);
    thrust::host_vector<int> h_temp_key(N);
    thrust::host_vector<int> h_temp_value1(N);
    thrust::host_vector<int> h_temp_value2(N);

    // Fill the inputs with random data so that this case sorts the same kind
    // of data as the array-of-structures case rather than all-zero buffers.
    for (int i = 0; i < N; i++)
    {
        h_temp_key[i]    = randomByteValue();
        h_temp_value1[i] = randomByteValue();
        h_temp_value2[i] = randomByteValue();
        //printf("Original data - key %i value1 %i value2 %i\n", h_temp_key[i], h_temp_value1[i], h_temp_value2[i]);
    }
    //printf("\n\n");

    const size_t bytes = static_cast<size_t>(N) * sizeof(int);
    CUDA_CHECK(cudaMemcpy(d_struct2.key,    h_temp_key.data(),    bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_struct2.value1, h_temp_value1.data(), bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_struct2.value2, h_temp_value2.data(), bytes, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(start, 0));

    // Wrap the raw device pointers so Thrust dispatches to the device backend.
    thrust::device_ptr<int> dev_ptr_key    = thrust::device_pointer_cast(d_struct2.key);
    thrust::device_ptr<int> dev_ptr_value1 = thrust::device_pointer_cast(d_struct2.value1);
    thrust::device_ptr<int> dev_ptr_value2 = thrust::device_pointer_cast(d_struct2.value2);

    thrust::device_vector<int> d_indices(N);
    thrust::sequence(d_indices.begin(), d_indices.end(), 0, 1);

    // First sort the keys, carrying the original positions along with them.
    thrust::sort_by_key(dev_ptr_key, dev_ptr_key + N, d_indices.begin());

    // Then reorder each value array with the sorted positions. thrust::gather
    // requires that its input does not overlap its output, so the result goes
    // to a scratch buffer and is copied back afterwards.
    thrust::device_vector<int> d_scratch(N);
    thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value1, d_scratch.begin());
    thrust::copy(d_scratch.begin(), d_scratch.end(), dev_ptr_value1);
    thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value2, d_scratch.begin());
    thrust::copy(d_scratch.begin(), d_scratch.end(), dev_ptr_value2);

    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, start, stop));
    printf("Sorting struct of arrays - elapsed time: %3.1f ms \n", elapsedMs);

    CUDA_CHECK(cudaMemcpy(h_temp_key.data(),    d_struct2.key,    bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_temp_value1.data(), d_struct2.value1, bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_temp_value2.data(), d_struct2.value2, bytes, cudaMemcpyDeviceToHost));

    //for (int i = 0; i < N; i++) printf("Ordered data - key %i value1 %i value2 %i\n", h_temp_key[i], h_temp_value1[i], h_temp_value2[i]);
    //printf("\n\n");

    // Release the device buffers and null the members, so that any later
    // cudaFree on them is a harmless no-op.
    CUDA_CHECK(cudaFree(d_struct2.key));    d_struct2.key    = NULL;
    CUDA_CHECK(cudaFree(d_struct2.value1)); d_struct2.value1 = NULL;
    CUDA_CHECK(cudaFree(d_struct2.value2)); d_struct2.value2 = NULL;

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Keep the console window open until a key is pressed.
    getchar();

    return 0;
}

为了简单起见,原答案省略了适当的 CUDA 错误检查;规范做法请参见 Stack Overflow 上的问题《What is the canonical way to check for errors using the CUDA runtime API?》。

关于在 CUDA 中对结构数组进行排序,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/23541503/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com