
c++ - OpenMP implementation slower than serial implementation


I am currently trying to get familiar with OpenMP. For practice, I implemented a greedy "learning" algorithm with OpenMP. I then measured the runtime with

time ./a.out

and compared it against my serial implementation: no matter how many iterations my program runs, the OpenMP version is always considerably slower.
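To narrow down where the time goes, one could also time just the search with OpenMP's own wall clock instead of the whole process (a minimal sketch, assuming the program structure shown below; it excludes startup and training-data generation, which time ./a.out lumps in):

double t0 = omp_get_wtime(); //wall-clock seconds, declared in <omp.h>
LTM fav = iterate(its, td, 1, 6); //the call under measurement
double t1 = omp_get_wtime();
printf("iterate: %f s\n", t1 - t0);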

Here is my code; the comments should explain everything:

#include <omp.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <cmath>
#include <stdio.h>
#include <ctime>

#define THREADS 4

using namespace std;

struct TrainData {
    double input;
    double output;
};

//Long Term Memory struct
struct LTM {
    double a; //parameter a of the polynomial
    double b;
    double c;
    double score; //score to be minimized!

    LTM()
    {
        a = 0;
        b = 0;
        c = 0;
        score = 0;
    }

    //random LTM with parameters from low to high (including low and high)
    LTM(int low, int high)
    {
        score = 0;
        a = rand() % (high - low + 1) + low;
        b = rand() % (high - low + 1) + low;
        c = rand() % (high - low + 1) + low;
    }

    LTM(double _a, double _b, double _c)
    {
        a = _a;
        b = _b;
        c = _c;
        score = 0;
    }

    void print()
    {
        cout << "Score: " << score << endl;
        cout << "a: " << a << " b: " << b << " c: " << c << endl;
    }
};

//the actual polynomial function, evaluated with the passed LTM
inline double evaluate(LTM &ltm, const double &x)
{
    return ltm.a*x*x + ltm.b*x + ltm.c;
}


//scoring function: calculates the Root Mean Square error (RMS)
inline double score_function(LTM &ltmnew, vector<TrainData> &td)
{
    double score = 0;
    double val;
    int tdsize = td.size();

    for(int i = 0; i < tdsize; i++)
    {
        val = td.at(i).output - evaluate(ltmnew, td.at(i).input);
        val *= val;
        score += val;
    }

    score /= (double)tdsize;
    score = sqrt(score);

    return score;
}

LTM iterate(int iterations, vector<TrainData> td, int low, int high)
{
    LTM fav = LTM(low, high);
    fav.score = score_function(fav, td);
    fav.print();
    LTM favs[THREADS]; //array for collecting the favorites of each thread

    #pragma omp parallel num_threads(THREADS) firstprivate(fav, low, high, td)
    {
        #pragma omp master
        printf("Threads: %d\n", omp_get_num_threads());

        LTM cand;
        #pragma omp for private(cand)
        for(int i = 0; i < iterations; i++)
        {
            cand = LTM(low, high);
            cand.score = score_function(cand, td);

            if(cand.score < fav.score)
                fav = cand;
        }

        //save the favorite before ending the parallel section
        #pragma omp critical
        favs[omp_get_thread_num()] = fav;
    }

    //search for the best one in the array
    for(int i = 0; i < THREADS; i++)
    {
        if(favs[i].score < fav.score)
            fav = favs[i];
    }

    return fav;
}

//generate training data from -50 up to (excluding) 50 with the train LTM
void generateTrainData(vector<TrainData> *td, LTM train)
{
    #pragma omp parallel for schedule(dynamic, 25)
    for(int i = -50; i < 50; i++)
    {
        struct TrainData d;
        d.input = i;
        d.output = evaluate(train, (double)i);
        #pragma omp critical
        td->push_back(d);

        //cout<<"input: "<<d.input<<" -> "<<d.output<<endl;
    }
}

int main(int argc, char *argv[])
{
    int its = 10000000; //number of iterations
    int a = 2;
    int b = 4;
    int c = 6;

    srand(time(NULL));
    LTM pol = LTM(a, b, c); //original polynomial parameters
    vector<TrainData> td;

    //first generate some training data and save it to td
    generateTrainData(&td, pol);

    //try to find the best solution
    LTM fav = iterate(its, td, 1, 6);

    printf("Final: a=%f b=%f c=%f score: %f\n", fav.a, fav.b, fav.c, fav.score);

    return 0;
}

On my home PC, this implementation took 12 seconds; the serial one took only 6 s. If I increase the number of iterations tenfold, it is roughly 2 minutes vs. 1 minute (OpenMP/serial).
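(A measurement caveat: the build line is not shown above, so the following is an assumption. The comparison only holds if both versions are compiled with the same optimization level, e.g. something along the lines of

g++ -O2 -fopenmp main.cpp -o a.out

with -fopenmp dropped for the serial build, in which case GCC simply ignores the OpenMP pragmas.)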

Can anyone help me?
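One likely culprit, offered as a hedged guess rather than a confirmed diagnosis: every LTM(low, high) goes through rand(), which mutates hidden global state, and common C libraries such as glibc guard that state with a lock, so all threads serialize and contend on it in the hot loop. A sketch of a per-thread alternative using the C++11 <random> engines (the helper name random_param and the seeding scheme are illustrative, not part of the code above):

#include <random>
#include <omp.h>

//returns a uniform integer in [low, high] without shared RNG state;
//each thread lazily constructs and seeds its own engine on first use
int random_param(int low, int high)
{
    static thread_local std::mt19937 gen(std::random_device{}() + omp_get_thread_num());
    std::uniform_int_distribution<int> dist(low, high);
    return dist(gen);
}

The LTM(int low, int high) constructor could then assign a, b and c via random_param(low, high), so the parallel loop no longer touches shared state between score evaluations.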
