c - Initializing data with OpenMP (shallow water algorithm)


First of all, my English is not very good, so I apologize in advance for any awkward writing...

I am learning how to parallelize C code with OpenMP, and the algorithm I am trying to parallelize is a shallow water equations solver. Although a plain #pragma omp parallel for on the critical loops gave me close to a 40% speedup, I know my implementation is very poor and I am not exploiting the cores as I should. The structure of the code is simple: a main() that allocates memory and initializes some matrices and arrays, then calls a function named solver() that does all the work, which is where I placed the #pragma omp parallel for.

I thought I could improve performance with a parallel region around the allocations and initialization, so that every thread would have all the data, but when I run the program I get no speedup. Since I am a beginner, I don't know whether the idea is bad or the implementation is bad. I would appreciate some help or hints to improve the algorithm's performance. This is my homework and I don't want someone to do it for me, just a little help that lets me move forward...


I will paste the code for better understanding:

Main function (allocations and initialization):

int main(int argc, char **argv) {
    long int i, j, m, n, M, N;
    char *ptr;
    long int s;
    int flag, verbose;
    double *Q;
    double *x, *y;
    double **ffx, **nFx, **ffy, **nFy;
    double dx, dt, epsi, delta, dy, tend, tmp, stime;

    /* Default values to use: M volumes in the x-direction and N volumes in the y-direction */
    M = 1000;
    N = 1000;

    /* create file and verbose flags */
    .......
    .......

    /* Parse command line options */
    .......
    .......

    epsi = 2.0;
    delta = 0.5;
    dx = (xend - xstart) / (double) M;
    dy = (yend - ystart) / (double) N;
    dt = dx / sqrt(9.81 * 5.0);
    tend = 0.1;

    /* Add two ghost volumes at each side of the domain */
    m = M + 2;
    n = N + 2;

    /* Allocate memory for the domain */

    /* HERE IS WHERE I PUT THE PRAGMA FOR PARALLEL INITIALIZATION AND ALLOCATIONS */
    #pragma omp parallel
    {
        Q = (double *) malloc(m * n * cell_size * sizeof(double));

        x = (double *) malloc(m * sizeof(double));
        y = (double *) malloc(n * sizeof(double));

        /* Allocate memory for fluxes */
        ffx = (double **) malloc(cell_size * sizeof(double *));
        ffy = (double **) malloc(cell_size * sizeof(double *));
        nFx = (double **) malloc(cell_size * sizeof(double *));
        nFy = (double **) malloc(cell_size * sizeof(double *));

        ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
        nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
        ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
        nFy[0] = (double *) malloc(cell_size * n * sizeof(double));

        for (i = 0; i < cell_size; i++) {
            ffx[i] = ffx[0] + i * m;
            nFx[i] = nFx[0] + i * m;
            ffy[i] = ffy[0] + i * n;
            nFy[i] = nFy[0] + i * n;
        }

        for (i = 0, tmp = -dx/2 + xstart; i < m; i++, tmp += dx)
            x[i] = tmp;

        for (i = 0, tmp = -dy/2 + ystart; i < n; i++, tmp += dy)
            y[i] = tmp;

        /* Set initial Gauss hump */
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                Q(0, i, j) = 4.0;
                Q(1, i, j) = 0.0;
                Q(2, i, j) = 0.0;
            }
        }

        for (i = 1; i < m-1; i++) {
            for (j = 1; j < n-1; j++) {
                Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2) + pow(y[j] - yend / 4.0, 2)) /
                                              (pow(delta, 2)));
            }
        }
    }

    /* Record start time */
    stime = gettime();

    /* THIS IS THE FUNCTION WHERE THE 'WORK' IS DONE */
    solver(Q, ffx, ffy, nFx, nFy, m, n, tend, dx, dy, dt);
}

Solver function (the critical part):

/*
    This is the main solver routine.
*/
void solver(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
            int m, int n, double tend, double dx, double dy, double dt) {
    double bc_mask[3] = {1.0, -1.0, -1.0};
    double time;
    int i, j, k, steps;

    steps = ceil(tend / dt);
    for (i = 0, time = 0.0; i < steps; i++, time += dt) {

        /* Apply boundary condition */
        #pragma omp parallel for private(j) num_threads (NTHR)
        for (k = 0; k < cell_size; k++) {
            for (j = 1; j < n - 1; j++) {
                Q(k, 0, j)   = bc_mask[k] * Q(k, 1, j);
                Q(k, m-1, j) = bc_mask[k] * Q(k, m-2, j);
            }
        }

        #pragma omp parallel for private(j) num_threads (NTHR)
        for (k = 0; k < cell_size; k++) {
            for (j = 0; j < m; j++) {
                Q(k, j, 0)   = bc_mask[k] * Q(k, j, 1);
                Q(k, j, n-1) = bc_mask[k] * Q(k, j, n-2);
            }
        }

        /* Update all volumes with the Lax-Friedrich's scheme */
        laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);
    }
}

/*
    This is the Lax-Friedrich's scheme for updating volumes
*/
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
                    int m, int n, double dx, double dy, double dt) {
    int i, j, k;

    /* Calculate and update fluxes in the x-direction */
    #pragma omp parallel for private(k,j) num_threads (NTHR)
    for (i = 1; i < n; i++) {
        fx(Q, ffx, m, n, i);
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m; j++)
                nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m-1; j++)
                Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
    }

    /* Calculate and update fluxes in the y-direction */
    #pragma omp parallel for private(k,j) num_threads (NTHR)
    for (i = 1; i < m; i++) {
        fy(Q, ffy, m, n, i);
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < n; j++)
                nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) - dy/dt * (Q(k, i, j) - Q(k, i, j-1)));
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < n-1; j++)
                Q(k, i, j) = Q(k, i, j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
    }
}

As far as I understand, there are no data dependencies in the loops of the solver function and its subfunctions, and since putting a parallel region around the allocations and initialization achieved nothing, I don't know how to proceed.

Thanks in advance!

Best answer

There are several problems with your code. First, there is a data race: variables such as Q, x, and y are written by all threads. Either do the allocations outside of the parallel region, or perform them with only a single thread (#pragma omp master or #pragma omp single).
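
For illustration, a minimal sketch of the single-thread allocation pattern described above, assuming the m, n, and cell_size variables from the question's main():

#pragma omp parallel
{
    /* Only one thread performs the shared allocations; the implicit
       barrier at the end of 'single' guarantees the pointers are
       visible to all threads before anyone uses them. */
    #pragma omp single
    {
        Q = (double *) malloc(m * n * cell_size * sizeof(double));
        x = (double *) malloc(m * sizeof(double));
        y = (double *) malloc(n * sizeof(double));
    }

    /* ... work-shared initialization of Q, x, and y goes here ... */
}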

Second, you do not parallelize the for loops in the initialization section. In fact, each of those loops is executed by every thread over its whole range (again a data race, and probably a lot of cache contention). You should add #pragma omp for to those loops (inside the parallel region). For the nested loops, the collapse directive may be useful.
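
As a sketch of that suggestion applied to the Gauss-hump initialization from the question (assuming the Q(k, i, j) macro and the variables from main(); this fragment sits inside the existing parallel region):

/* Each thread now gets a chunk of the collapsed (i, j) iteration
   space, instead of every thread writing the whole array. The
   collapsed loop variables i and j are implicitly private. */
#pragma omp for collapse(2)
for (i = 0; i < m; i++) {
    for (j = 0; j < n; j++) {
        Q(0, i, j) = 4.0;
        Q(1, i, j) = 0.0;
        Q(2, i, j) = 0.0;
    }
}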

Finally, make sure that there are no data races in the solver() and laxf_scheme_2d() functions. It appears that most of the computation time is spent inside laxf_scheme_2d(). However, this function does not appear to run in parallel at all. Does it use OpenMP internally?
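
One concrete race worth checking, shown here only as a hypothetical sketch: in the posted laxf_scheme_2d(), the row buffers ffx and nFx are shared, so concurrent iterations of the parallel i loop overwrite each other's fluxes. A possible fix is per-thread scratch buffers (fx_local is an assumed variant of fx() that writes into the supplied buffer; buffers are indexed as buf[k*m + j]):

#pragma omp parallel num_threads (NTHR)
{
    /* Hypothetical per-thread scratch buffers: each thread owns its
       flux rows, so the i iterations no longer race on ffx/nFx. */
    double *ffx_p = (double *) malloc(cell_size * m * sizeof(double));
    double *nFx_p = (double *) malloc(cell_size * m * sizeof(double));
    int i, j, k;

    #pragma omp for
    for (i = 1; i < n; i++) {
        fx_local(Q, ffx_p, m, n, i);   /* assumed: fx() writing into ffx_p */
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m; j++)
                nFx_p[k*m + j] = 0.5 * ((ffx_p[k*m + j-1] + ffx_p[k*m + j])
                                        - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m-1; j++)
                Q(k, j, i) -= dt/dx * (nFx_p[k*m + j+1] - nFx_p[k*m + j]);
    }

    free(ffx_p);
    free(nFx_p);
}

Each column i of Q is updated only from values in that same column, so the i iterations themselves are independent once the scratch buffers are private.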

The original question for "c - Initializing data with OpenMP (shallow water algorithm)" is on Stack Overflow: https://stackoverflow.com/questions/36445842/
