c - Porting an OpenMP program to CUDA C: correct grid_size/block_size and reduction


I want to convert an OpenMP program to CUDA C.
I have tried to find my way through the web and the SDK, but the material is beyond my level.
My C program iterates over n = 2^30 indices and adds up the weight of each index.

1) What are the correct grid_size and block_size?
My guess is to mimic the OpenMP setup and use

grid_size=n/max_number_of_cuda_threads;
block_size=1;
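For comparison, here is a rough, untested sketch of an alternative layout (many threads per block plus a grid-stride loop, one partial sum per thread); the block/grid numbers are only placeholders and the kernel name is illustrative:

// Untested sketch: many threads per block, grid-stride loop, one partial sum per thread.
__device__ float get_weigth_of_index(long index, float* data)  // same function as in program.cu below
{
    float v = 0;
    for (int i = 0; i < 4; i++)
        v += index * data[i];
    return v;
}

__global__ void looper_strided(long n, float* data, float* partial)
{
    long tid    = blockIdx.x * (long)blockDim.x + threadIdx.x;
    long stride = (long)gridDim.x * blockDim.x;
    float r = 0;
    for (long i = tid; i < n; i += stride)   // each thread walks the index range in strides
        r += get_weigth_of_index(i, data);
    partial[tid] = r;                        // partial[] needs grid_size * block_size entries
}

// host side: the numbers here are placeholders, not tuned values
// int block_size = 256;                     // threads per block
// int grid_size  = 14 * 8;                  // 14 multiprocessors on the M2050, a few blocks each
// looper_strided<<<grid_size, block_size>>>(n, data_device, partial_device);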

2) How do I implement the OpenMP reduction in CUDA?
I tried a cudaMemcpy followed by reducing the array in plain C, but it seems slow.
I looked at the Thrust library and its reduce operator, but I don't know how to integrate it with my current code (see the sketch after program.cu below).

program.c

#include <math.h>
#include <omp.h>

float get_weigth_of_index(long index, float* data) {
    int i;
    float v = 0;
    for (i = 0; i < 4; i++)
        v += index * data[i];
    return v;
}

int main() {
    long i;
    float r = 0;
    long n = pow(2, 30);
    float data[4] = {0, 1, 2, 3};
    #pragma omp parallel for reduction (+:r)
    for (i = 0; i < n; i++)
        r += get_weigth_of_index(i, data);
    return 0;
}

program.cu

#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <math.h>

__device__ float get_weigth_of_index(long index, float* data) {
    int i;
    float v = 0;
    for (i = 0; i < 4; i++)
        v += index * data[i];
    return v;
}

__global__ void looper(long max_number_of_cuda_threads, float* data, float* result) {
    long bid = blockIdx.x;
    long start = bid * max_number_of_cuda_threads;
    long end = start + max_number_of_cuda_threads;
    long i;
    float r = 0;
    for (i = start; i < end; i++)
        r += get_weigth_of_index(i, data);
    result[bid] = r;
}

int main() {
    long n = pow(2, 30);
    int max_number_of_cuda_threads = 1024; // I'm not sure this is correct
    long grid_size = n / max_number_of_cuda_threads;
    long block_size = 1;

    float data_host[4] = {0, 1, 2, 3};
    float* data_device = 0;
    float* result_device = 0;
    cudaMalloc((void**)&data_device, sizeof(float) * 4);
    cudaMemcpy(data_device, data_host, sizeof(float) * 4, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&result_device, sizeof(float) * grid_size);

    looper<<<grid_size, block_size>>>(max_number_of_cuda_threads, data_device, result_device);

    // reduction with standard C: the cudaMemcpy seems slow
    float* result_host = (float*)malloc(sizeof(float) * grid_size);
    cudaMemcpy(result_host, result_device, sizeof(float) * grid_size, cudaMemcpyDeviceToHost);

    long i;
    float v = 0;
    #pragma omp parallel for reduction(+:v)
    for (i = 0; i < grid_size; i++)
        v += result_host[i];
    printf("result:%f\n", v);

    return 0;
}
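What I mean by integrating Thrust is something like the following untested sketch: wrap result_device and reduce the per-block sums on the device, instead of the cudaMemcpy plus OpenMP loop above. The function name reduce_partial_sums is just illustrative:

// Untested sketch: reduce the per-block partial sums on the device with Thrust,
// replacing the cudaMemcpy + OpenMP loop above.
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/device_ptr.h>

float reduce_partial_sums(float* result_device, long grid_size)
{
    // wrap the raw CUDA pointer so Thrust algorithms can operate on it
    thrust::device_ptr<float> p = thrust::device_pointer_cast(result_device);
    return thrust::reduce(p, p + grid_size, 0.0f, thrust::plus<float>());
}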

My graphics card:

Device 0: "Tesla M2050"
Number of multiprocessors: 14
Number of cores: 448
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 32768
Warp size: 32
Maximum number of threads per block: 1024
Maximum sizes of each dimension of a block: 1024 x 1024 x 64
Maximum sizes of each dimension of a grid: 65535 x 65535 x 1
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes

Best answer

I think thrust::transform_reduce can solve your problem. This code shows how to use it:

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <cmath>

struct get_weigth_of_index
{
    get_weigth_of_index(float* data, size_t n)
    {
        // copy the weight table to the device once, so the functor can read it
        cudaMalloc((void**)&_data, n * sizeof(float));
        cudaMemcpy(_data, data, n * sizeof(float), cudaMemcpyHostToDevice);
        _n = n;
    }

    float* _data;
    size_t _n;

    __host__ __device__
    float operator()(const float& index) const
    {
        float v = 0;
        for (size_t i = 0; i < _n; i++)
            v += index * _data[i];
        return v;
    }
};

int main(void)
{
    float x[4] = {1.0, 2.0, 3.0, 4.0};

    size_t len = 1024;                      // init your value
    float* index = new float[len];          // init and fill your array here
    for (size_t i = 0; i < len; i++)
        index[i] = (float)i;

    // transfer to device
    thrust::device_vector<float> d_index(index, index + len);

    get_weigth_of_index unary_op(x, 4);
    thrust::plus<float> binary_op;
    float init = 0;

    // apply the functor to every index, then sum the results on the device
    float sum = thrust::transform_reduce(d_index.begin(), d_index.end(), unary_op, init, binary_op);

    std::cout << sum << std::endl;

    delete[] index;
    return 0;
}
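If materializing an array of 2^30 indices is too costly in memory, a variation on the same idea is to feed transform_reduce a thrust::counting_iterator, so the indices are generated on the fly. The sketch below is untested; the functor name weight_of_index is illustrative, and it stores the four weights by value rather than allocating them on the device:

#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/execution_policy.h>
#include <iostream>

struct weight_of_index
{
    float d0, d1, d2, d3;                    // the four weights, stored by value

    weight_of_index(const float* data)
        : d0(data[0]), d1(data[1]), d2(data[2]), d3(data[3]) {}

    __host__ __device__
    float operator()(long index) const
    {
        // same computation as get_weigth_of_index in the question
        return index * d0 + index * d1 + index * d2 + index * d3;
    }
};

int main()
{
    const long n = 1L << 30;                 // 2^30 indices, as in the question
    float data[4] = {0.0f, 1.0f, 2.0f, 3.0f};

    thrust::counting_iterator<long> first(0);
    thrust::counting_iterator<long> last = first + n;

    // apply the functor to each generated index and sum the results on the device
    float sum = thrust::transform_reduce(thrust::device, first, last,
                                         weight_of_index(data),
                                         0.0f, thrust::plus<float>());
    std::cout << sum << std::endl;
    return 0;
}

Note that accumulating 2^30 terms in single precision loses a lot of accuracy; passing a double as the initial value (with thrust::plus<double>) should make the accumulation happen in double, at some cost in speed.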

Regarding "c - Porting an OpenMP program to CUDA C: correct grid_size/block_size and reduction", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/10766114/
