gpt4 book ai didi

c++ - 从串行到 omp : no speedup

转载 作者:行者123 更新时间:2023-11-27 23:39:40 26 4
gpt4 key购买 nike

我是 openMP 的新手。我正在研究全对最短路径算法,这是我需要并行化的串行 C++ 例程(完整代码在文章末尾):

void mini(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
size_t i, j;

for ( i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);

}

在执行时我得到这个:

$ time ./floyd 

real 0m0,349s
user 0m0,349s
sys 0m0,000s

现在,我尝试插入一些指令:

void mini(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{

#pragma omp parallel
{
size_t i, j;

#pragma omp parallel for
for ( i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}

}

不幸的是,没有加速:

$ grep -c ^processor /proc/cpuinfo                                    
4
$ time ./floyd

real 0m0,547s
user 0m2,073s
sys 0m0,004s

我做错了什么?

编辑

处理器:Intel(R) Core(TM) i5-4590 CPU(4 个硬件核心)

完整代码:

#include <cstdio>
#include <vector>
#include <limits>
#include <ctime>
#include <random>
#include <set>
#include <omp.h>

using namespace std;

typedef struct Wedge
{
int a, b;
double w;
} Wedge;


typedef pair<int, int> edge;

int randrange(int end, int start=0)
{
random_device rd;
mt19937 gen(rd());
uniform_int_distribution<> dis(start, end-1);

return dis(gen);
}

void relax_omp(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
#pragma omp parallel
{
size_t i, j;

#pragma omp parallel for
for (i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);

}
}


void relax_serial(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
size_t i, j;


for (i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}


void floyd(vector<vector<double>> &dist, bool serial)
{
size_t i, k;

size_t n {dist.size()};

for (k=0; k<n; k++)
{
vector<double> rowk =dist[k];
vector<double> colk(n);

for (i=0; i<n; i++)
colk[i]=dist[i][k];
if (serial)
relax_serial(dist, n, rowk, colk);
else
relax_omp(dist, n, rowk, colk);
}

for (i=0; i<n; i++)
dist[i][i]=0;
}


vector<Wedge> random_edges(int n, double density, double max_weight)
{
int M{n*(n-1)/2};
double m{density*M};
set<edge> edges;
vector<Wedge> wedges;

while (edges.size()<m)
{
pair<int,int> L;
L.first=randrange(n);
L.second=randrange(n);

if (L.first!=L.second && edges.find(L) == edges.end())
{
double w=randrange(max_weight);
Wedge wedge{L.first, L.second, w};
wedges.push_back(wedge);
edges.insert(L);
}
}
return wedges;
}


vector<vector<double>> fill_distances(vector<Wedge> wedges, int n)
{
double INF = std::numeric_limits<double>::infinity();
size_t i, m=wedges.size();
vector<vector<double>> dist(n, vector<double>(n, INF));
int a, b;
double w;
for (i=0; i<m; i++)
{ a=wedges[i].a;
b=wedges[i].b;
w=wedges[i].w;
dist[a][b]=w;
}
return dist;
}


int main (void)
{
double density{0.33};
double max_weight{200};
int n{800};
bool serial;

int ntest=10;
double avge_serial=0, avge_omp=0;

for (int i=0; i<ntest; i++)
{
vector<Wedge> wedges=random_edges(n, density, max_weight);
vector<vector<double>> dist=fill_distances(wedges, n);
double dtime;

dtime = omp_get_wtime();
serial=true;
floyd(dist, serial);
dtime = omp_get_wtime() - dtime;
avge_serial+=dtime;


dtime = omp_get_wtime();
serial=false;
floyd(dist, serial);
dtime = omp_get_wtime() - dtime;
avge_omp+=dtime;
}
printf("%d tests, n=%d\n", ntest, n);
printf("Average serial : %.2lf\n", avge_serial/ntest);
printf("Average openMP : %.2lf\n", avge_omp/ntest);
return 0;
}

输出:

20 tests, n=800
Average serial : 0.31
Average openMP : 0.61

命令行:

g++ -std=c++11 -Wall -O2 -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.cpp -o floyd -lm -fopenmp

最佳答案

您的主要问题是您不小心使用了嵌套并行性:

#pragma omp parallel
{
size_t i, j;

#pragma omp parallel for

因为你已经在一个平行区域,你的第二行应该是

    #pragma omp for

否则,由于 omp parallel for 等于 omp parallelomp for,您有两个嵌套的并行区域,这通常很糟糕.修复这个小问题可以在类似的 CPU 上获得大约 2 倍的加速。

有几个限制因素导致您不太可能获得完整的 4 倍加速,例如但不限于:

  • 内存带宽成为瓶颈
  • 由于在并行循环中完成的工作量较小而导致的相对开销
  • 在 turbo 模式下使用多线程降低时钟频率

编辑:顺便说一下,编写代码的更惯用的方法如下:

void relax_omp(...) {
#pragma omp parallel for
for (size_t i=0; i<n; i++) {
for (size_t j=0; j<n; j++) {
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}
}
}

如果您尽可能在本地声明变量,OpenMP 几乎总是会做正确的事情。在这种情况下,这意味着 ij 是私有(private)的。通常,以这种方式推理代码要容易得多。

关于c++ - 从串行到 omp : no speedup,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56382895/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com