gpt4 book ai didi

c++ - 串行版本工作正常,而 CUDA 版本不工作

转载 作者:行者123 更新时间:2023-11-30 02:02:00 28 4
gpt4 key购买 nike

我下面这段极简的 CUDA 代码返回了不正确的结果(所有多边形最终都只有 0 个顶点),而用 C++ 串行运行的相同代码却工作正常。这个问题属于“易并行”(embarrassingly parallel)问题:线程之间没有通信,也不需要线程同步等,而且 CUDA 内存分配是成功的。在 CUDA 版本中,就连为了调试而用来保存输入数组内容的 dummy 变量也是 0。不存在越界访问,因为我的数组足够大。在 CUDA 中用循环代替 memcpy 也没有任何改变。
我真的不明白发生了什么......任何想法?谢谢!

CUDA 代码:

    #include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <cuda.h>

// Plain 2D point (also used as a 2D vector) with double coordinates.
class Point2D {
public:
    double x, y;

    // Device-only constructor; every omitted coordinate defaults to 0.
    __device__ Point2D(double xx = 0, double yy = 0)
        : x(xx), y(yy) {}
};

// Dot product of A and B, both read as 2D vectors.
__device__ double dot(const Point2D &A, const Point2D &B) {
    const double xx = A.x * B.x;
    const double yy = A.y * B.y;
    return xx + yy;
}
// Scalar multiplication: returns the point (a * P.x, a * P.y).
__device__ Point2D operator*(double a, const Point2D &P) {
    const double sx = a * P.x;
    const double sy = a * P.y;
    return Point2D(sx, sy);
}
// Component-wise sum. A is received by value, so it doubles as the result.
__device__ Point2D operator+(Point2D A, const Point2D &B) {
    A.x += B.x;
    A.y += B.y;
    return A;
}
// Component-wise difference A - B. A is received by value, so it doubles as
// the result.
__device__ Point2D operator-(Point2D A, const Point2D &B) {
    A.x -= B.x;
    A.y -= B.y;
    return A;
}
// Intersection of the line through A and B with the perpendicular bisector
// ("mediator") of segment CD. The result is A - t * (B - A), where t is
// chosen so that the point is orthogonal-projection-equal to the midpoint of
// CD along direction D - C. The divisor dot(B - A, D - C) is not checked
// against zero here.
__device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) {
    const Point2D mid = 0.5*(C+D);
    const Point2D along = B-A;    // direction of the line being intersected
    const Point2D normal = D-C;   // normal of the bisector
    const double t = dot(A-mid, normal)/dot(along, normal);
    return A - t * along;
}

// Polygon stored as a fixed-capacity vertex list (at most MAX_PTS vertices)
// that can be clipped in place. Every member function is device-only.
class Polygon {
public:
    // Capacity of pts. An enum keeps this a compile-time constant that needs
    // no out-of-class definition.
    enum { MAX_PTS = 128 };

    // Creates an empty polygon; dummy starts at 0 so it is never read while
    // indeterminate.
    __device__ Polygon() : nbpts(0), dummy(0.f) {}

    // Appends pt as the last vertex. A vertex that would not fit into pts is
    // dropped instead of being written past the end of the array.
    __device__ void addPts(Point2D pt) {
        if (nbpts >= MAX_PTS) return;
        pts[nbpts] = pt;
        nbpts++;
    }

    // Copies the vertex count, dummy and the first rhs.nbpts vertices of rhs.
    __device__ Polygon& operator=(const Polygon& rhs) {
        if (this == &rhs) return *this;  // memcpy must not get overlapping ranges
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
    }

    // Clips the polygon in place, keeping the part that lies strictly on
    // inside_pt's side of the perpendicular bisector of the segment
    // [inside_pt, outside_pt]. Every edge S->E is visited once; an edge that
    // crosses the bisector contributes its intersection point (the shape of
    // one Sutherland-Hodgman clipping pass against a single edge).
    // Vertices lying exactly on the bisector count as outside, so calling
    // this with inside_pt == outside_pt removes every vertex.
    __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
        // Nothing to clip; also avoids reading pts[-1] below.
        if (nbpts <= 0) return;

        int new_nbpts = 0;
        Point2D newpts[MAX_PTS];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        // S is the start of the current edge; the first edge closes the loop
        // from the last vertex to the first one.
        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {
            Point2D E = pts[i];

            // Signed position of E and S relative to the bisector:
            // negative means "on inside_pt's side".
            double ddot = -ABM + dot(AB, E);
            double ddot2 = -ABM + dot(AB, S);

            if (ddot<0) { // E inside clip edge
                if (ddot2>0 && new_nbpts<MAX_PTS) { // edge enters the kept side
                    newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
                    new_nbpts++;
                }
                if (new_nbpts<MAX_PTS) {
                    newpts[new_nbpts] = E;
                    new_nbpts++;
                }
            } else if (ddot2<0 && new_nbpts<MAX_PTS) { // edge leaves the kept side
                newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
                new_nbpts++;
            }
            S = E;
        }

        // new_nbpts is already bounded by MAX_PTS, so the copy stays in range.
        memcpy(pts, newpts, new_nbpts*sizeof(Point2D));
        nbpts = new_nbpts;
    }

//private:
    Point2D pts[MAX_PTS];  // vertices; only the first nbpts entries are meaningful
    int nbpts;             // number of vertices currently stored
    float dummy;           // extra payload copied by operator=; unused by the geometry code
};


// Kernel: one thread per point. Thread idx starts from the unit square, clips
// it with Polygon::cut against every point stored in `a`, and writes the
// resulting polygon to polygons[idx].
//   a        : N floats read as N/2 interleaved (x, y) pairs
//   polygons : output array; entries 0 .. N/2-1 are written
//   N        : number of floats in `a`
// NOTE(review): unlike the serial cut_poly later in this file, the loop below
// does not skip i == idx. Cutting with inside_pt == outside_pt makes AB the
// zero vector in Polygon::cut, so no vertex passes the `ddot<0` test and the
// polygon ends up with 0 vertices.
__global__ void cut_poly(float *a, Polygon* polygons, int N)
{
// Flat 1D global thread index; threads beyond the N/2 points do nothing.
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx>=N/2) return;

// Unit square, listed counter-clockwise starting at the origin.
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));

// The point owned by this thread.
Point2D curPt(a[2*idx], a[2*idx+1]);

for (int i=0; i<N/2; i++) {
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
// Stores one input value in the polygon's extra payload field.
pol.dummy = a[idx];

polygons[idx] = pol;
}



// Aborts the program with a message when a CUDA runtime call has failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return;
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(1);
}

// Host driver: fills N floats with pseudo-random values, runs cut_poly on the
// device and prints, for each of the N/2 polygons, a_h[i], the polygon's
// dummy value and its vertex count.
int main(int argc, char* argv[])
{
    const int N = 100;            // number of floats, read by the kernel as N/2 (x, y) pairs
    const int n_polys = N/2;
    float a_h[N], *a_d = NULL;
    Polygon *p_d = NULL;

    size_t size = N * sizeof(float);
    size_t size_pol = n_polys * sizeof(Polygon);

    // Raw host buffer: Polygon's constructor is __device__-only, so host code
    // must not construct Polygon objects. It only reads the public data
    // members of the bytes copied back from the device.
    Polygon *p_h = (Polygon*)malloc(size_pol);
    if (p_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size_pol);
        return 1;
    }

    checkCuda(cudaMalloc((void **) &a_d, size), "cudaMalloc a_d");
    checkCuda(cudaMalloc((void **) &p_d, size_pol), "cudaMalloc p_d");

    for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
    checkCuda(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice), "copy a_h -> a_d");

    // Only n_polys threads do any work, so the grid is sized for them
    // (rounded up to whole blocks).
    int block_size = 4;
    int n_blocks = (n_polys + block_size - 1) / block_size;
    cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
    checkCuda(cudaGetLastError(), "cut_poly launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "cut_poly execution");  // faults inside the kernel

    checkCuda(cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost), "copy a_d -> a_h");
    checkCuda(cudaMemcpy(p_h, p_d, size_pol, cudaMemcpyDeviceToHost), "copy p_d -> p_h");

    for (int i=0; i<n_polys; i++)
        printf("%f \t %f \t %d\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);

    checkCuda(cudaFree(a_d), "cudaFree a_d");
    checkCuda(cudaFree(p_d), "cudaFree p_d");
    free(p_h);

    return 0;
}

在 C++ 中可以正常工作的相同代码:

#include <stdio.h>
#include <iostream>
#include <stdlib.h>

// Plain 2D point (also used as a 2D vector) with double coordinates.
class Point2D {
public:
    double x, y;

    // Every omitted coordinate defaults to 0.
    Point2D(double xx = 0, double yy = 0)
        : x(xx), y(yy) {}
};

// Dot product of A and B, both read as 2D vectors.
double dot(const Point2D &A, const Point2D &B) {
    const double xx = A.x * B.x;
    const double yy = A.y * B.y;
    return xx + yy;
}
// Scalar multiplication: returns the point (a * P.x, a * P.y).
Point2D operator*(double a, const Point2D &P) {
    const double sx = a * P.x;
    const double sy = a * P.y;
    return Point2D(sx, sy);
}
// Component-wise sum. A is received by value, so it doubles as the result.
Point2D operator+(Point2D A, const Point2D &B) {
    A.x += B.x;
    A.y += B.y;
    return A;
}
// Component-wise difference A - B. A is received by value, so it doubles as
// the result.
Point2D operator-(Point2D A, const Point2D &B) {
    A.x -= B.x;
    A.y -= B.y;
    return A;
}
// Intersection of the line through A and B with the perpendicular bisector
// ("mediator") of segment CD, computed as A - t * (B - A). The divisor
// dot(B - A, D - C) is not checked against zero here.
Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) {
    const Point2D mid = 0.5*(C+D);
    const Point2D along = B-A;    // direction of the line being intersected
    const Point2D normal = D-C;   // normal of the bisector
    const double t = dot(A-mid, normal)/dot(along, normal);
    return A - t * along;
}

// Polygon stored as a fixed-capacity vertex list (at most MAX_PTS vertices)
// that can be clipped in place.
class Polygon {
public:
    // Capacity of pts. An enum keeps this a compile-time constant that needs
    // no out-of-class definition.
    enum { MAX_PTS = 128 };

    // Creates an empty polygon; dummy starts at 0 so it is never read while
    // indeterminate.
    Polygon() : nbpts(0), dummy(0.f) {}

    // Appends pt as the last vertex. A vertex that would not fit into pts is
    // dropped instead of being written past the end of the array.
    void addPts(Point2D pt) {
        if (nbpts >= MAX_PTS) return;
        pts[nbpts] = pt;
        nbpts++;
    }

    // Copies the vertex count, dummy and the first rhs.nbpts vertices of rhs.
    // Element-wise copy: safe for self-assignment and needs no <cstring>.
    Polygon& operator=(const Polygon& rhs) {
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        for (int i=0; i<nbpts; i++) pts[i] = rhs.pts[i];
        return *this;
    }

    // Clips the polygon in place, keeping the part that lies strictly on
    // inside_pt's side of the perpendicular bisector of the segment
    // [inside_pt, outside_pt]. Every edge S->E is visited once; an edge that
    // crosses the bisector contributes its intersection point (the shape of
    // one Sutherland-Hodgman clipping pass against a single edge).
    // Vertices lying exactly on the bisector count as outside, so calling
    // this with inside_pt == outside_pt removes every vertex.
    void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
        // Nothing to clip; also avoids reading pts[-1] below.
        if (nbpts <= 0) return;

        int new_nbpts = 0;
        Point2D newpts[MAX_PTS];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        // S is the start of the current edge; the first edge closes the loop
        // from the last vertex to the first one.
        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {
            Point2D E = pts[i];

            // Signed position of E and S relative to the bisector:
            // negative means "on inside_pt's side".
            double ddot = -ABM + dot(AB, E);
            double ddot2 = -ABM + dot(AB, S);

            if (ddot<0) { // E inside clip edge
                if (ddot2>0 && new_nbpts<MAX_PTS) { // edge enters the kept side
                    newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
                    new_nbpts++;
                }
                if (new_nbpts<MAX_PTS) {
                    newpts[new_nbpts] = E;
                    new_nbpts++;
                }
            } else if (ddot2<0 && new_nbpts<MAX_PTS) { // edge leaves the kept side
                newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
                new_nbpts++;
            }
            S = E;
        }

        // new_nbpts is already bounded by MAX_PTS, so the copy stays in range.
        for (int i=0; i<new_nbpts; i++) pts[i] = newpts[i];
        nbpts = new_nbpts;
    }

//private:
    Point2D pts[MAX_PTS];  // vertices; only the first nbpts entries are meaningful
    int nbpts;             // number of vertices currently stored
    float dummy;           // extra payload copied by operator=; unused by the geometry code
};


void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
if (idx>=N/2) return;

Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));

Point2D curPt(a[2*idx], a[2*idx+1]);

for (int i=0; i<N/2; i++) {
if (idx==i) continue;
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];

polygons[idx] = pol;
}



// Serial driver: fills N floats with pseudo-random values, computes one
// clipped polygon per (x, y) pair and prints, for each polygon, a_h[i], the
// polygon's dummy value and its vertex count.
int main(int argc, char* argv[])
{
    const int N = 100;   // number of floats, read by cut_poly as N/2 (x, y) pairs
    float a_h[N];
    Polygon p_h[N/2];

    for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;

    // cut_poly ignores indices >= N/2, so only the N/2 real polygons are visited.
    for (int idx=0; idx<N/2; idx++)
        cut_poly(idx, a_h, p_h, N);

    for (int i=0; i<N/2; i++)
        printf("%f \t %f \t %d\n", a_h[i], p_h[i].dummy, p_h[i].nbpts);

    return 0;
}

最佳答案

好吧,我猜你可以无视我的大部分评论。我错误地在一台我用 CUDA 3.2 设置的机器上工作,它在内核启动失败方面表现不同。当我切换到 CUDA 4.1 和 CUDA 5.0 时,事情开始变得有意义了。为我的困惑道歉。

不管怎样,在解决了这个问题之后,我很快注意到您的 CPU 和 GPU 实现之间存在差异。具体在这里(看CPU代码):

// CPU version of cut_poly, quoted to point out the self-skip line that the GPU
// kernel lacks. Starting from the unit square, it clips the polygon with
// Polygon::cut against every other point in `a` and stores the result in
// polygons[idx]. `a` holds N floats read as N/2 interleaved (x, y) pairs.
void cut_poly(int idx, float *a, Polygon* polygons, int N)
{
if (idx>=N/2) return;

// Unit square, listed counter-clockwise starting at the origin.
Polygon pol;
pol.addPts(Point2D(0.,0.));
pol.addPts(Point2D(1.,0.));
pol.addPts(Point2D(1.,1.));
pol.addPts(Point2D(0.,1.));

// The point this call is responsible for.
Point2D curPt(a[2*idx], a[2*idx+1]);

for (int i=0; i<N/2; i++) {
if (idx==i) continue; /* NOTE THIS LINE MISSING FROM YOUR GPU CODE */
Point2D other_pt(a[2*i], a[2*i+1]);
pol.cut(curPt, other_pt);
}
pol.dummy = a[idx];

polygons[idx] = pol;
}

关于我在上面添加注释的那一行:如果您把这行代码添加到 GPU 代码中 cut_poly 内核的相应位置,那么至少在我这里,GPU 代码会产生与 CPU 代码相同的打印结果。

我的另一个观察是,您没有必要让每个 block 只运行 4 个线程。在调试代码时这样做没有问题,但一旦用于“生产”目的,您很可能希望使用更大的线程数,比如 256,并且一定要选择 32 的整数倍,以获得最佳性能。

针对评论中提出的问题:我相信数据已被正确复制,但很可能是您在主机上没有正确访问它。(我不知道您是如何得出“我的数组没有正确返回到主机”这一结论的。)您的类定义中大部分成员函数都只标记了 __device__,因此在主机上很难访问类内部的结构(例如 Polygon 类中的 Point2D pts 数组)。我在这里贴出修改后的代码,我认为它可以证明数据确实被传回了主机:

    #include <stdio.h>
#include <iostream>
#include <stdlib.h>
// #include <cuda.h>

// Reads (and clears) the CUDA runtime's last recorded error with
// cudaGetLastError(). If it is not cudaSuccess, prints msg, the runtime's
// error string and the file/line of the macro use to stderr, then exits with
// status 1. Meant to be placed right after a CUDA API call or kernel launch.
// The local is named without a double underscore because such identifiers
// are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// Plain 2D point (also used as a 2D vector) with double coordinates.
class Point2D {
public:
    double x, y;

    // Usable from host and device code; every omitted coordinate defaults to 0.
    __host__ __device__ Point2D(double xx = 0, double yy = 0)
        : x(xx), y(yy) {}
};

// Dot product of A and B, both read as 2D vectors.
__host__ __device__ double dot(const Point2D &A, const Point2D &B) {
    const double xx = A.x * B.x;
    const double yy = A.y * B.y;
    return xx + yy;
}
// Scalar multiplication: returns the point (a * P.x, a * P.y).
__host__ __device__ Point2D operator*(double a, const Point2D &P) {
    const double sx = a * P.x;
    const double sy = a * P.y;
    return Point2D(sx, sy);
}
// Component-wise sum. A is received by value, so it doubles as the result.
__host__ __device__ Point2D operator+(Point2D A, const Point2D &B) {
    A.x += B.x;
    A.y += B.y;
    return A;
}
// Component-wise difference A - B. A is received by value, so it doubles as
// the result.
__host__ __device__ Point2D operator-(Point2D A, const Point2D &B) {
    A.x -= B.x;
    A.y -= B.y;
    return A;
}
// Intersection of the line through A and B with the perpendicular bisector
// ("mediator") of segment CD, computed as A - t * (B - A). The divisor
// dot(B - A, D - C) is not checked against zero here.
__host__ __device__ Point2D inter(const Point2D &A, const Point2D &B, const Point2D &C, const Point2D &D) {
    const Point2D mid = 0.5*(C+D);
    const Point2D along = B-A;    // direction of the line being intersected
    const Point2D normal = D-C;   // normal of the bisector
    const double t = dot(A-mid, normal)/dot(along, normal);
    return A - t * along;
}

// Polygon stored as a fixed-capacity vertex list (at most MAX_PTS vertices)
// that can be clipped in place. Every member function is callable from both
// host and device code.
class Polygon {
public:
    // Capacity of pts. An enum keeps this a compile-time constant that needs
    // no out-of-class definition.
    enum { MAX_PTS = 128 };

    // Creates an empty polygon; dummy starts at 0 so it is never read while
    // indeterminate.
    __host__ __device__ Polygon() : nbpts(0), dummy(0.f) {}

    // Appends pt as the last vertex. A vertex that would not fit into pts is
    // dropped instead of being written past the end of the array.
    __host__ __device__ void addPts(Point2D pt) {
        if (nbpts >= MAX_PTS) return;
        pts[nbpts] = pt;
        nbpts++;
    }

    // Copies the vertex count, dummy and the first rhs.nbpts vertices of rhs.
    __host__ __device__ Polygon& operator=(const Polygon& rhs) {
        if (this == &rhs) return *this;  // memcpy must not get overlapping ranges
        nbpts = rhs.nbpts;
        dummy = rhs.dummy;
        memcpy(pts, rhs.pts, nbpts*sizeof(Point2D));
        return *this;
    }

    // Returns slot i of the vertex array, or slot 0 when i is outside the
    // array. The index is checked against the array capacity, not against
    // nbpts, so slots at or beyond nbpts may hold stale or default values.
    __host__ __device__ Point2D getpoint(unsigned i) const {
        if (i < MAX_PTS) return pts[i];
        return pts[0];
    }

    // Clips the polygon in place, keeping the part that lies strictly on
    // inside_pt's side of the perpendicular bisector of the segment
    // [inside_pt, outside_pt]. Every edge S->E is visited once; an edge that
    // crosses the bisector contributes its intersection point (the shape of
    // one Sutherland-Hodgman clipping pass against a single edge).
    // Vertices lying exactly on the bisector count as outside, so calling
    // this with inside_pt == outside_pt removes every vertex.
    __host__ __device__ void cut(const Point2D &inside_pt, const Point2D &outside_pt) {
        // Nothing to clip; also avoids reading pts[-1] below.
        if (nbpts <= 0) return;

        int new_nbpts = 0;
        Point2D newpts[MAX_PTS];
        Point2D AB(outside_pt-inside_pt);
        Point2D M(0.5*(outside_pt+inside_pt));
        double ABM = dot(AB, M);

        // S is the start of the current edge; the first edge closes the loop
        // from the last vertex to the first one.
        Point2D S = pts[nbpts-1];

        for (int i=0; i<nbpts; i++) {
            Point2D E = pts[i];

            // Signed position of E and S relative to the bisector:
            // negative means "on inside_pt's side".
            double ddot = -ABM + dot(AB, E);
            double ddot2 = -ABM + dot(AB, S);

            if (ddot<0) { // E inside clip edge
                if (ddot2>0 && new_nbpts<MAX_PTS) { // edge enters the kept side
                    newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
                    new_nbpts++;
                }
                if (new_nbpts<MAX_PTS) {
                    newpts[new_nbpts] = E;
                    new_nbpts++;
                }
            } else if (ddot2<0 && new_nbpts<MAX_PTS) { // edge leaves the kept side
                newpts[new_nbpts] = inter(S,E, inside_pt, outside_pt);
                new_nbpts++;
            }
            S = E;
        }

        // new_nbpts is already bounded by MAX_PTS, so the copy stays in range.
        memcpy(pts, newpts, new_nbpts*sizeof(Point2D));
        nbpts = new_nbpts;
    }

//private:
    Point2D pts[MAX_PTS];  // vertices; only the first nbpts entries are meaningful
    int nbpts;             // number of vertices currently stored
    float dummy;           // extra payload copied by operator=; unused by the geometry code
};


// Kernel: one thread per point. Thread idx starts from the unit square, clips
// it with Polygon::cut against every other point stored in `a`, and writes
// the resulting polygon to polygons[idx].
//   a        : N floats read as N/2 interleaved (x, y) pairs
//   polygons : output array; entries 0 .. N/2-1 are written
//   N        : number of floats in `a`
// Expects a 1D launch; threads whose flat index is >= N/2 return immediately.
__global__ void cut_poly(float *a, Polygon* polygons, int N)
{
    const int nb_points = N/2;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= nb_points) return;

    // Unit square, listed counter-clockwise starting at the origin.
    const double corners[4][2] = { {0., 0.}, {1., 0.}, {1., 1.}, {0., 1.} };
    Polygon cell;
    for (int c = 0; c < 4; c++)
        cell.addPts(Point2D(corners[c][0], corners[c][1]));

    // The point owned by this thread.
    const Point2D own_pt(a[2*idx], a[2*idx+1]);

    for (int i = 0; i < nb_points; i++) {
        if (idx == i) continue;  // never clip against the point itself
        const Point2D neighbour(a[2*i], a[2*i+1]);
        cell.cut(own_pt, neighbour);
    }

    // Exposes the x coordinate of vertex slot 0 through the payload field so
    // the host can cross-check the copied vertex data.
    cell.dummy = cell.getpoint(0).x;

    polygons[idx] = cell;
}



// Host driver: fills N floats with pseudo-random values, runs cut_poly on the
// device and prints, for each of the N/2 polygons, a_h[i], the polygon's
// dummy value, the x coordinate of its vertex slot 0 and its vertex count.
// Every CUDA call is followed by cudaCheckErrors, which aborts on failure.
int main(int argc, char* argv[])
{
    const int N = 100;   // number of floats, read by the kernel as N/2 (x, y) pairs
    float a_h[N], *a_d;
    Polygon p_h[N/2], *p_d;

    size_t size = N * sizeof(float);
    size_t size_pol = N/2 * sizeof(Polygon);

    cudaMalloc((void **) &a_d, size);
    cudaCheckErrors("cm1");
    cudaMalloc((void **) &p_d, size_pol);
    cudaCheckErrors("cm2");

    for (int i=0; i<N; i++) a_h[i] = (float)(rand()%1000)*0.001;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cmcp1");

    // Only N/2 threads do any work, so the grid is sized for them (rounded
    // up to whole blocks).
    int block_size = 128;
    int n_threads = N/2;
    int n_blocks = (n_threads + block_size - 1) / block_size;
    cut_poly <<< n_blocks, block_size >>> (a_d, p_d, N);
    cudaCheckErrors("kernel");        // launch-configuration errors
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel sync");   // faults raised while the kernel ran

    cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cmcp2");
    cudaMemcpy(p_h, p_d, size_pol, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cmcp3");

    for (int i=0; i<N/2; i++)
        printf("%f \t %f \t %f \t %d\n", a_h[i], p_h[i].dummy, p_h[i].getpoint(0).x, p_h[i].nbpts);

    cudaFree(a_d);
    cudaCheckErrors("free1");
    cudaFree(p_d);
    cudaCheckErrors("free2");

    return 0;
}

我建议针对这些问题发布新问题。

关于c++ - 串行工作时 Cuda 版本不工作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/13630817/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com