
c - Why is OMP task slower than OMP for?

Reposted | Author: 太空狗 | Updated: 2023-10-29 15:37:33

I have just started with OpenMP and want to use it to solve the wave equation. Here is the serial code:

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>

#define GRID_SZ 3000
#define ARR_SZ (GRID_SZ * GRID_SZ)
#define PEAK_SZ 31

void sequential_update_withoutomp(double *data, double *olddata, double *newdata, double C, double K, double dt);

double *process_withoutomp() {
    double start = omp_get_wtime();
    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0 / (PEAK_SZ - 1.0);
    data    = (double*)malloc(sizeof(double) * ARR_SZ);
    olddata = (double*)malloc(sizeof(double) * ARR_SZ);
    newdata = (double*)malloc(sizeof(double) * ARR_SZ);

    /* flat initial surface */
    for (i = 0; i < ARR_SZ; i++) {
        data[i] = 1.0;
    }

    for (i = 0; i < PEAK_SZ; i++) {
        linspace[i] = -1.0 + delta * i;
    }

    for (i = 0; i < PEAK_SZ; i++) {
        for (j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    /* add a Gaussian peak near one corner as the initial disturbance */
    for (i = 0; i < PEAK_SZ; i++) {
        for (j = 0; j < PEAK_SZ; j++) {
            data[(i + 20) * GRID_SZ + j + 20] += h * exp(-5 * (pow(x[i][j], 2) + pow(x[j][i], 2)));
        }
    }

    for (i = 0; i < ARR_SZ; i++) {
        olddata[i] = data[i];
    }

    /* 20 time steps, rotating the three buffers after each step */
    for (i = 0; i < 20; i++) {
        sequential_update_withoutomp(data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }
    double end = omp_get_wtime();
    printf("without omp spend: %f\n", end - start);

    return data;
}

void sequential_update_withoutomp(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    int i, j, add_i, sub_i, add_j, sub_j;
    double pot;
    for (i = 0; i < GRID_SZ; i++) {
        for (j = 0; j < GRID_SZ; j++) {
            /* clamped 5-point stencil */
            add_i = i + 1 >= GRID_SZ ? i : i + 1;
            add_j = j + 1 >= GRID_SZ ? j : j + 1;
            sub_i = i - 1 < 0 ? 0 : i - 1;
            sub_j = j - 1 < 0 ? 0 : j - 1;
            pot = data[add_i * GRID_SZ + j] +
                  data[sub_i * GRID_SZ + j] +
                  data[add_j + i * GRID_SZ] +
                  data[sub_j + i * GRID_SZ] -
                  4 * data[i * GRID_SZ + j];
            newdata[i * GRID_SZ + j] =
                (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] * (2 - K * dt)) / (2 + K * dt);
        }
    }
}

Here is the version using OpenMP parallel for:

void sequential_update_withomp(double *data, double *olddata, double *newdata, double C, double K, double dt);

double *process_withomp() {
    double start = omp_get_wtime();

    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0 / (PEAK_SZ - 1.0);
    data    = (double*)malloc(sizeof(double) * ARR_SZ);
    olddata = (double*)malloc(sizeof(double) * ARR_SZ);
    newdata = (double*)malloc(sizeof(double) * ARR_SZ);

    #pragma omp parallel for private(i) schedule(auto)
    for (i = 0; i < ARR_SZ; i++) {
        data[i] = 1.0;
    }

    #pragma omp parallel for private(i,j) schedule(auto)
    for (i = 0; i < PEAK_SZ; i++) {
        linspace[i] = -1.0 + delta * i;
        for (j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i,j) schedule(auto)
    for (i = 0; i < PEAK_SZ; i++) {
        for (j = 0; j < PEAK_SZ; j++) {
            data[(i + 20) * GRID_SZ + j + 20] += h * exp(-5 * (pow(x[i][j], 2) + pow(x[j][i], 2)));
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i) schedule(auto)
    for (i = 0; i < ARR_SZ; i++) {
        olddata[i] = data[i];
    }

    #pragma omp barrier

    for (i = 0; i < 20; i++) {
        sequential_update_withomp(data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }

    double end = omp_get_wtime();
    printf("with omp spend: %f\n", end - start);
    return data;
}

void sequential_update_withomp(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    int i, j;
    double pot;
    #pragma omp parallel for private(i,j,pot) schedule(auto)
    for (i = 0; i < GRID_SZ; i++) {
        for (j = 0; j < GRID_SZ; j++) {
            pot = data[(i + 1 >= GRID_SZ ? i : i + 1) * GRID_SZ + j] +
                  data[(i - 1 < 0 ? 0 : i - 1) * GRID_SZ + j] +
                  data[(j + 1 >= GRID_SZ ? j : j + 1) + i * GRID_SZ] +
                  data[(j - 1 < 0 ? 0 : j - 1) + i * GRID_SZ] -
                  4 * data[i * GRID_SZ + j];
            newdata[i * GRID_SZ + j] =
                (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] * (2 - K * dt))
                / (2 + K * dt);
        }
    }
}

This version works fine, but when I try to replace the parallel for with tasks, the results are still correct, yet it takes much longer:

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt);

double *process_withomp1() {
    double start = omp_get_wtime();

    int i, j;
    double dt = 0.04, C = 16, K = 0.1, h = 6;
    double *data, *olddata, *newdata, *tmp;
    double x[PEAK_SZ][PEAK_SZ], linspace[PEAK_SZ], delta = 2.0 / (PEAK_SZ - 1.0);
    data    = (double*)malloc(sizeof(double) * ARR_SZ);
    olddata = (double*)malloc(sizeof(double) * ARR_SZ);
    newdata = (double*)malloc(sizeof(double) * ARR_SZ);

    #pragma omp parallel for private(i) schedule(auto)
    for (i = 0; i < ARR_SZ; i++) {
        data[i] = 1.0;
    }

    #pragma omp parallel for private(i,j) schedule(auto)
    for (i = 0; i < PEAK_SZ; i++) {
        linspace[i] = -1.0 + delta * i;
        for (j = 0; j < PEAK_SZ; j++) {
            x[i][j] = linspace[i];
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i,j) schedule(auto)
    for (i = 0; i < PEAK_SZ; i++) {
        for (j = 0; j < PEAK_SZ; j++) {
            data[(i + 20) * GRID_SZ + j + 20] += h * exp(-5 * (pow(x[i][j], 2) + pow(x[j][i], 2)));
        }
    }

    #pragma omp barrier

    #pragma omp parallel for private(i) schedule(auto)
    for (i = 0; i < ARR_SZ; i++) {
        olddata[i] = data[i];
    }

    #pragma omp barrier

    for (i = 0; i < 20; i++) {
        sequential_update_withomp1(data, olddata, newdata, C, K, dt);
        tmp = olddata;
        olddata = data;
        data = newdata;
        newdata = tmp;
    }

    double end = omp_get_wtime();
    printf("with omp spend: %f\n", end - start);
    return data;
}

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    int i, j;
    double pot;
    #pragma omp parallel private(i,j,pot)
    for (i = 0; i < GRID_SZ; i++) {
        for (j = 0; j < GRID_SZ; j++) {
            #pragma omp task
            {
                pot = data[(i + 1 >= GRID_SZ ? i : i + 1) * GRID_SZ + j] +
                      data[(i - 1 < 0 ? 0 : i - 1) * GRID_SZ + j] +
                      data[(j + 1 >= GRID_SZ ? j : j + 1) + i * GRID_SZ] +
                      data[(j - 1 < 0 ? 0 : j - 1) + i * GRID_SZ] -
                      4 * data[i * GRID_SZ + j];
                newdata[i * GRID_SZ + j] =
                    (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j] - olddata[i * GRID_SZ + j] * (2 - K * dt))
                    / (2 + K * dt);
            }
        }
    }
}

On my Mac the serial version takes about 7.7 s and the parallel for version takes 3.7 s, but the task version takes 53 s.
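A minimal driver to reproduce these timings might look like this (assuming the three functions and the includes above are all in one file; the compile line in the comment is only an example):

/* compile e.g. with: cc -O2 -fopenmp wave.c -o wave -lm */
int main(void) {
    double *a = process_withoutomp();
    double *b = process_withomp();
    double *c = process_withomp1();

    /* crude sanity check that the three versions agree */
    printf("sample values: %f %f %f\n", a[ARR_SZ / 2], b[ARR_SZ / 2], c[ARR_SZ / 2]);

    free(a);
    free(b);
    free(c);
    return 0;
}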

Does anyone know what is going wrong here?

Thanks in advance.

Best Answer

There are two things to consider here:

  • a) thread granularity, i.e. how much work each thread gets
  • b) the way the tasks are created

In your code, a) is too small and b) is broken.

a) In your task example, a single iteration of the inner loop is one task, whereas in the parallel for example the iterations of the outer loop are split among the threads, so each thread processes a large chunk of outer iterations. Even with schedule(static, 1), one whole outer iteration (a full row of GRID_SZ inner iterations) would be the smallest unit of work handed to a thread. Keep in mind that all parallelism adds overhead for synchronization, bookkeeping and so on, and this extra cost has to be paid back by the speedup of running in parallel. Finding the right granularity is essential: chunks or tasks should be as large as possible to keep the overhead small, yet small and numerous enough to keep every execution unit busy and to give the scheduler room to compensate load imbalance between tasks/chunks.
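To make that trade-off concrete, here is a small self-contained toy sketch (not the wave-equation code) contrasting fine and coarse chunking of the same loop with dynamic scheduling:

#include <stdio.h>
#include <omp.h>

#define N 1000000

static double a[N];

int main(void) {
    double t0 = omp_get_wtime();

    /* Fine grained: with a chunk size of 1, each thread grabs one iteration
     * at a time, so the scheduling cost is paid N times. */
    #pragma omp parallel for schedule(dynamic, 1)
    for (int i = 0; i < N; i++)
        a[i] = i * 0.5;

    double t1 = omp_get_wtime();

    /* Coarse grained: chunks of 100000 iterations amortize the scheduling
     * cost while still leaving enough chunks to balance the load. */
    #pragma omp parallel for schedule(dynamic, 100000)
    for (int i = 0; i < N; i++)
        a[i] += 1.0;

    double t2 = omp_get_wtime();

    printf("fine: %f s, coarse: %f s\n", t1 - t0, t2 - t1);
    return 0;
}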

b) Running the loop nest inside your parallel region without a worksharing construct means that every thread executes the whole loop nest, so all tasks are created once per thread. That is essentially like running the serial program several times in parallel, on top of the task overhead.

void sequential_update_withomp1(double *data, double *olddata, double *newdata, double C, double K, double dt) {
    // ....
    #pragma omp parallel private(i,j,pot)
    {
        // split the outer loop among the threads of the parallel region,
        // i.e. create the tasks in parallel
        #pragma omp for
        for (i = 0; i < GRID_SZ; i++) {
            // coarse grained tasks (as in the parallel for version)
            #pragma omp task
            {
                // each inner for loop is one task
                for (j = 0; j < GRID_SZ; j++) {
                    // ...
                }
            } // task
        } // parallel for
    } // parallel region
}

This gives me (on 2 cores x 2 hyperthreads):

serial:       4.839213
parallel for: 2.529813
task:         2.817615

Note: there is no real point in using tasks here at all, since they only add overhead on top of the parallel for loop.
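For comparison, the other common way to create each task exactly once is to let a single thread generate them. A sketch of that idiom for this update, one task per row, reusing GRID_SZ and the headers from the question (this is an illustration, not part of the original answer):

void update_with_single_tasks(double *data, double *olddata, double *newdata,
                              double C, double K, double dt) {
    #pragma omp parallel
    #pragma omp single
    {
        for (int i = 0; i < GRID_SZ; i++) {
            /* one task per row: coarse enough to amortize the task overhead */
            #pragma omp task firstprivate(i)
            {
                for (int j = 0; j < GRID_SZ; j++) {
                    int add_i = i + 1 >= GRID_SZ ? i : i + 1;
                    int sub_i = i - 1 < 0 ? 0 : i - 1;
                    int add_j = j + 1 >= GRID_SZ ? j : j + 1;
                    int sub_j = j - 1 < 0 ? 0 : j - 1;
                    double pot = data[add_i * GRID_SZ + j]
                               + data[sub_i * GRID_SZ + j]
                               + data[add_j + i * GRID_SZ]
                               + data[sub_j + i * GRID_SZ]
                               - 4 * data[i * GRID_SZ + j];
                    newdata[i * GRID_SZ + j] =
                        (pow(C * dt, 2) * pot * 2 + 4 * data[i * GRID_SZ + j]
                         - olddata[i * GRID_SZ + j] * (2 - K * dt)) / (2 + K * dt);
                }
            }
        }
    } /* implicit barrier of single/parallel: all tasks have finished here */
}

Only one thread walks the loop and spawns tasks, while the rest of the team executes them, so each task is created exactly once; for a regular loop like this one it still buys nothing over a plain parallel for.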

Regarding "c - Why is OMP task slower than OMP for?", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/48129831/
