
c++ - CUDA C++ design: reusable class with unknown compile-time size


I'm looking for a convenient design that lets me use, on the device, a class whose size is not known at compile time. Only one instance of this class needs to be sent to the device, and for that, a single call to cudaMalloc and cudaMemcpy should (ideally) be enough.

The host version of the class looks like this:

#include <vector>

class A {
public:
    A(int size) : table(size) {
        // some useful initialization of table
    }
    double get(int i) const {
        return table[i]; // return some processed element from table
    }
private:
    std::vector<int> table;
};

The kernel:

__global__ void kernel(const A *a){
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    a->get(idx); // do something useful with it
}

So far, the way I've designed the device version of the class is this:

const int sizeMax = 1000;

class A {
public:
    A(int size) {
        // size checking + some useful initialization of table
    }
    __host__ __device__
    double get(int i) const {
        return table[i]; // return some processed element from table
    }
private:
    int table[sizeMax];
};

And the client code:

A a(128);
A* da;
cudaMalloc((void**)&da, sizeof(A));
cudaMemcpy(da, &a, sizeof(A), cudaMemcpyHostToDevice);
kernel<<<1, 32>>>(da);
cudaDeviceSynchronize();
cudaFree(da);

This is ugly because:

  • to stay on the safe side, sizeMax has to be oversized, so every copy of an instance wastes bandwidth (see the sketch after this list)
  • the class is not closed to modification: the value of sizeMax will inevitably have to be raised at some point
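
A minimal sketch of that waste, reusing the fixed-size class A above (the figure in the comment assumes sizeMax = 1000 and 4-byte int):

#include <cstdio>

int main(){
    A a(128);                   // only 128 elements are actually used...
    printf("%zu\n", sizeof(A)); // ...yet roughly sizeMax * sizeof(int), about 4000 bytes,
                                // crosses the bus on every cudaMemcpy of one instance
}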

Is there any other way to achieve the same thing more cleanly, without a negative impact on performance? To be clear, I only need the device version of the class; the first version is just equivalent non-CUDA code, included to illustrate that the table size should be dynamic.
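
One way to read the "single cudaMalloc, single cudaMemcpy" ideal stated above is to pack the size and the table into one contiguous allocation. The following is a hypothetical sketch of that pattern, not the accepted answer below; DeviceA and make_device_a are names of my own choosing:

#include <cstdlib>
#include <cstring>
#include <cuda_runtime_api.h>

struct DeviceA {
    int size;
    int table[1]; // first element of a trailing array that actually holds `size` ints

    __device__ double get(int i) const {
        return table[i]; // return some processed element from table
    }
};

// Stage the object in one host buffer, then move it to the device with
// exactly one cudaMalloc and one cudaMemcpy.
DeviceA *make_device_a(const int *host_table, int size) {
    size_t bytes = sizeof(DeviceA) + (size - 1) * sizeof(int);
    DeviceA *h = (DeviceA *)malloc(bytes);
    h->size = size;
    memcpy(h->table, host_table, size * sizeof(int));
    DeviceA *d = NULL;
    cudaMalloc((void **)&d, bytes);
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    free(h);
    return d;
}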

Best answer

In my comments, I said:

  1. separate host and device storage for table, contained in the class, both of which are allocated dynamically
  2. dynamic allocation of table storage size in the constructor, rather than in your client code; this could also include resizing if necessary
  3. differentiation in class methods to use either the host copy of the data or the device copy (i.e. pointer) to the data, depending on whether the method is being executed in host or device code
  4. a method to copy data from host to device or vice versa, as the class context is moved from host to device or vice versa

Here is an example of what I had in mind:

#include <stdio.h>
#include <assert.h>
#include <stdlib.h>  // malloc / free
#include <string.h>  // memcpy
#include <cuda_runtime_api.h>
#include <iostream>


template <typename T>
class gpuvec{
private:
    T *h_vec = NULL;   // host copy of the data
    T *d_vec = NULL;   // device copy of the data
    size_t vsize = 0;
    bool iscopy;       // shallow copies must not free the shared allocations
public:
    // return the host or device pointer, depending on where the caller executes
    __host__ __device__
    T * data(){
#ifndef __CUDA_ARCH__
        return h_vec;
#else
        return d_vec;
#endif
    }
    __host__ __device__
    T& operator[](size_t i) {
        assert(i < vsize);
        return data()[i];}
    void to_device(){
        assert(cudaMemcpy(d_vec, h_vec, vsize*sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess);}
    void to_host(){
        assert(cudaMemcpy(h_vec, d_vec, vsize*sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess);}
    // shallow copy constructor, used when a gpuvec is passed to a kernel by value
    gpuvec(gpuvec &o){
        h_vec = o.h_vec;
        d_vec = o.d_vec;
        vsize = o.vsize;
        iscopy = true;}
    // deep copy: allocate fresh storage and duplicate both host and device data
    void copy(gpuvec &o){
        free();
        iscopy = false;
        vsize = o.vsize;
        h_vec = (T *)malloc(vsize*sizeof(T));
        assert(h_vec != NULL);
        assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);
        memcpy(h_vec, o.h_vec, vsize*sizeof(T));
        assert(cudaMemcpy(d_vec, o.d_vec, vsize*sizeof(T), cudaMemcpyDeviceToDevice) == cudaSuccess);}
    gpuvec(size_t ds) {
        assert(ds > 0);
        iscopy = false;
        vsize = ds;
        h_vec = (T *)malloc(vsize*sizeof(T));
        assert(h_vec != NULL);
        assert(cudaMalloc(&d_vec, vsize*sizeof(T)) == cudaSuccess);}
    gpuvec(){
        iscopy = false;}
    ~gpuvec(){
        if (!iscopy) free();}
    void free(){
        if (d_vec != NULL) cudaFree(d_vec);
        d_vec = NULL;
        if (h_vec != NULL) ::free(h_vec);
        h_vec = NULL;}
    __host__ __device__
    size_t size() {
        return vsize;}
};

template <typename T>
__global__ void test(gpuvec<T> d){
    // d is a shallow copy; on the device, operator[] resolves to the device pointer
    for (int i = 0; i < d.size(); i++){
        d[i] += 1;
    }
}


int main(){
    size_t ds = 10;
    gpuvec<int> A(ds);
    for (size_t i = 0; i < ds; i++)
        A[i] = 0; // initialize host data; malloc leaves it indeterminate
    A.to_device();
    test<<<1,1>>>(A);
    A.to_host();
    for (size_t i = 0; i < ds; i++)
        std::cout << A[i];
    std::cout << std::endl; // prints 1111111111
    gpuvec<int> B;
    B.copy(A);
    A.free();
    B.to_device();
    test<<<1,1>>>(B);
    B.to_host();
    for (size_t i = 0; i < ds; i++)
        std::cout << B[i];
    std::cout << std::endl; // prints 2222222222
    B.free();
}

I'm sure plenty of criticisms could be made. This may not adhere to any particular opinion of what "vector syntax" should be. Furthermore, I'm sure there are use cases it does not cover, and it may contain outright defects. Creating a robust host/device vector realization could require as much work and complexity as thrust host and device vectors. I'm not suggesting, however, that thrust vectors are a drop-in answer to the question.
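
For context, this is roughly what the thrust containers mentioned above look like in use. It's a minimal sketch; note that a thrust::device_vector cannot be dereferenced inside a kernel directly (a raw pointer has to be extracted), which is part of why it isn't a drop-in answer here:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>

__global__ void plus_one(int *d, size_t n){
    for (size_t i = 0; i < n; i++) d[i] += 1;
}

int main(){
    thrust::host_vector<int> h(10, 0);      // dynamically sized host storage
    thrust::device_vector<int> d = h;       // allocation + host-to-device copy
    // the container itself cannot cross the kernel boundary; a raw pointer can
    plus_one<<<1,1>>>(thrust::raw_pointer_cast(d.data()), d.size());
    h = d;                                  // device-to-host copy
    for (size_t i = 0; i < h.size(); i++) std::cout << h[i];
    std::cout << std::endl;
}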

Regarding c++ - CUDA C++ design: reusable class with unknown compile-time size, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/50509600/
