gpt4 book ai didi

c - 如何使用 OpenMP 并行化在矩阵上进行迭代的 while 循环?

转载 作者:行者123 更新时间:2023-12-03 13:16:31 26 4
gpt4 key购买 nike

我正在尝试并行化此代码的 while 循环。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define n 100

float max_dif(float w_new[n][n], float w[n][n]);

/*
 * Repeatedly relaxes an n x n grid: each interior cell of w_new is set to the
 * average of its four neighbours in w, and the sweep is repeated while the
 * largest interior change reported by max_dif() is greater than 0.0001.
 * Prints the number of sweeps that were performed.
 */
int main()
{
    float w[n][n];
    float w_new[n][n] = {0};
    float max = 100;

    /* Start every cell of w at 75. */
    #pragma omp parallel for
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            w[row][col] = 75;
        }
    }

    /* Fixed edge values: 0 along the top row, 100 along the other three edges. */
    #pragma omp parallel for
    for (int k = 0; k < n; k++) {
        w[0][k] = 0;
        w_new[0][k] = 0;

        w[n-1][k] = 100;
        w_new[n-1][k] = 100;

        w[k][0] = 100;
        w_new[k][0] = 100;

        w[k][n-1] = 100;
        w_new[k][n-1] = 100;
    }

    /* The top-right corner is set to 100 once the loop above has finished. */
    w_new[0][n-1] = 100;
    w[0][n-1] = 100;

    int counter = 0;

    while(max > 0.0001) {
        /* Build the next grid from the current one (interior cells only). */
        for (int row = 1; row < n-1; row++) {
            for (int col = 1; col < n-1; col++) {
                const float below = w[row+1][col];
                const float above = w[row-1][col];
                const float right = w[row][col+1];
                const float left  = w[row][col-1];

                w_new[row][col] = (float)((below + above + right + left) / 4);
            }
        }

        /* Largest change of this sweep; it drives the loop condition. */
        max = max_dif(w_new, w);

        /* The new grid becomes the input of the next sweep. */
        for (int row = 0; row < n; row++) {
            for (int col = 0; col < n; col++) {
                w[row][col] = w_new[row][col];
            }
        }

        counter++;
    }

    printf("Counter is: %d\n", counter);
    return 0;
}

/*
 * Returns the largest absolute element-wise difference between w_new and w.
 * Only interior cells (rows and columns 1 .. n-2) are examined; the result is
 * 0 when no interior cell differs.
 */
float max_dif(float w_new[n][n], float w[n][n])
{
    float largest = 0;

    for (int row = 1; row < n-1; row++) {
        for (int col = 1; col < n-1; col++) {
            /* Evaluate the absolute difference once and reuse it. */
            const float gap = fabs(w_new[row][col] - w[row][col]);

            if (largest < gap) {
                largest = gap;
            }
        }
    }

    return largest;
}
我原本考虑在 while 循环内部使用基本的 parallel for 构造,但我的 counter 变量预计会达到 5000 左右,所以我希望 while 循环本身也能并行执行。但我需要将 w_new 复制回 w,才能计算下一次迭代的 w_new。因此我认为 task 或 sections 行不通,因为每一次迭代都依赖上一次迭代得到的 w。

最佳答案

直接使用 OpenMP 并行化 while(max > 0.0001) 循环会很复杂,而且性能可能也不会太好。在 OpenMP 中,并行化代码主要有两种方式:基于循环的并行和基于任务的并行。后者不适用于当前代码;前者则要求在进入循环之前就能确定迭代次数,因此不能用于这种迭代次数取决于运行时收敛情况的循环。
但是,您可以并行化在 while 内执行的计算,例如:

  #pragma omp parallel
{
#pragma omp for
for (int i = 1; i < n-1; i++){
for (int j = 1; j < n-1; j++){
w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
}
}
max_dif绝对是并行化的良好候选者:
#pragma omp for reduction(max: max)
for (int i = 1; i < n-1; i++){
for (int j = 1; j < n-1; j++){
if (max < fabs(w_new[i][j]-w[i][j]))
max = fabs(w_new[i][j]-w[i][j]);
}
}
最后是while内的最后一个循环:
    #pragma omp for
for (int i = 0; i < n; i++){
for (int j = 0; j < n; j++){
w[i][j]=w_new[i][j];
}
}
不幸的是,要应用这种方法,您必须内联函数 max_dif:归约子句中使用的变量必须是共享的,而如果 max 在并行区域内声明(例如作为 max_dif 的局部变量),它对每个线程都是私有(private)的。此外,#pragma omp for 不允许使用 shared 子句。
完整示例:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define n 100
/*
 * Relaxes an n x n grid with OpenMP: each sweep sets every interior cell of
 * w_new to the average of its four neighbours in w, computes the largest
 * interior change with a max-reduction, and copies w_new back into w. Sweeps
 * repeat while that largest change is greater than 0.0001; the number of
 * sweeps is printed at the end.
 */
int main()
{
    float w[n][n],w_new[n][n]={0},max=100;

    /* Start every cell of w at 75. */
    #pragma omp parallel for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            w[i][j]=75;

    /*
     * Boundary values: 0 along the top row, 100 along the other three edges
     * and in all four corners. Every iteration writes only cells that no
     * other iteration writes, so the loop is free of data races.
     */
    #pragma omp parallel for
    for (int i = 0; i < n; i++){
        /* Left and right columns, corners included. */
        w[i][0]=100;
        w[i][n-1]=100;
        w_new[i][0]=100;
        w_new[i][n-1]=100;

        /* Top and bottom rows, corners excluded (they are set above). */
        if (i > 0 && i < n-1){
            w[0][i]=0;
            w[n-1][i]=100;
            w_new[0][i]=0;
            w_new[n-1][i]=100;
        }
    }

    int counter=0;

    while(max > 0.0001){
        /*
         * Reset the shared reduction variable once, before the team starts,
         * instead of letting every thread write it inside the region.
         */
        max = 0;
        #pragma omp parallel
        {
            #pragma omp for
            for (int i = 1; i < n-1; i++)
                for (int j = 1; j < n-1; j++)
                    w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);

            /* Largest interior change of this sweep. */
            #pragma omp for reduction(max: max)
            for (int i = 1; i < n-1; i++){
                for (int j = 1; j < n-1; j++){
                    float diff = fabsf(w_new[i][j]-w[i][j]);
                    if (max < diff)
                        max = diff;
                }
            }

            /* nowait: the end of the parallel region already synchronises. */
            #pragma omp for nowait
            for (int i = 0; i < n; i++)
                for (int j = 0; j < n; j++)
                    w[i][j]=w_new[i][j];
        }
        counter++;
    }

    printf("Counter is: %d\n", counter);
    return 0;
}
输出:
Counter is: 4894

I was thinking of the basic parallel for construct inside the while, but my counter variable is expected to be around 5000. So I would like that to be in parallel.


这是可能的,但需要进行一些更改。我在代码中解释它们。实际上,我们可以为整个代码使用单个并行区域:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define n 100

/*
 * Relaxes an n x n grid inside a single OpenMP parallel region: every thread
 * runs the while loop, and the work of each sweep is divided by worksharing
 * loops. A sweep sets each interior cell of w_new to the average of its four
 * neighbours in w, reduces the largest interior change into max, and copies
 * w_new back into w. Sweeps repeat while max is greater than 0.0001; the
 * number of sweeps is printed at the end.
 */
int main()
{
    /*
     * Static storage keeps the two n*n grids off the stack, so raising n
     * cannot overflow it. Static objects start zeroed, like the former ={0}.
     */
    static float w[n][n],w_new[n][n];
    float max=100;
    int counter = 0;
    #pragma omp parallel
    {
        /* Start every cell of w at 75. */
        #pragma omp for
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++)
                w[i][j]=75;

        /*
         * Boundary values: 0 along the top row, 100 along the other three
         * edges and in all four corners. Every iteration writes only cells
         * that no other iteration writes, so no corner needs to be patched
         * afterwards by the whole team.
         */
        #pragma omp for
        for (int i = 0; i < n; i++){
            /* Left and right columns, corners included. */
            w[i][0]=100;
            w[i][n-1]=100;
            w_new[i][0]=100;
            w_new[i][n-1]=100;

            /* Top and bottom rows, corners excluded (they are set above). */
            if (i > 0 && i < n-1){
                w[0][i]=0;
                w[n-1][i]=100;
                w_new[0][i]=0;
                w_new[n-1][i]=100;
            }
        }

        while(max > 0.0001){
            // We need this barrier to ensure that we don't have a
            // race condition between the master updating max to zero
            // and the other threads reading max > 0.0001
            #pragma omp barrier
            #pragma omp master
            #pragma omp atomic write
            max = 0;
            // No thread may start reducing into max before it is reset.
            #pragma omp barrier

            // Stencil update and largest-change reduction in one pass.
            #pragma omp for reduction(max:max)
            for (int i = 1; i < n-1; i++){
                for (int j = 1; j < n-1; j++){
                    w_new[i][j]=(float)((w[i+1][j]+w[i-1][j]+w[i][j+1]+w[i][j-1])/4);
                    float diff = fabsf(w_new[i][j]-w[i][j]);
                    if (max < diff)
                        max = diff;
                }
            }

            // nowait: the barrier at the top of the next iteration (or the
            // end of the parallel region) synchronises the copy.
            #pragma omp for nowait
            for (int i = 0; i < n; i++)
                for (int j = 0; j < n; j++)
                    w[i][j]=w_new[i][j];

            // Only the master thread counts sweeps.
            #pragma omp master
            counter++;
        }
    }
    printf("Counter is: %d\n", counter);
    return 0;
}
我做了一个粗略的测试:在 4 核机器上、输入规模为 1000 时,此版本的加速比约为 2.8 倍,算不上特别出色。可以尝试对其中部分代码进行向量化,以获得进一步的改进。

关于c - 如何使用 OpenMP 并行化在矩阵上进行迭代的 while 循环?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/66714286/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com