c - liblsoda : Using OpenMP with gcc to thread LSODA ODE solver in C-6ren

c - liblsoda : Using OpenMP with gcc to thread LSODA ODE solver in C

转载作者：行者123 更新时间：2023-11-30 20:09:27

关闭。这个问题需要details or clarity .它目前不接受答案。

想改进这个问题？通过 editing this post 添加详细信息并澄清问题.

3年前关闭。

Improve this question

我正在尝试使用 OpenMP 线程化可重入算法，但成功有限。 github 上有 FORTRAN LSODA 例程的 C 版本。它用于求解一阶常微分方程。它经历了几个版本，但最新的版本是 Simon Frost 的，可以在这里找到:

https://github.com/sdwfrost/liblsoda

该库附带一个简单的测试示例，我使用 OpenMP 将其更新为线程。关于并行实现，我有几个悬而未决的问题。我最“成功”的尝试如下(解决所用的时间没有减少，解决方案有时会有所不同):

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "common.h"
#include "lsoda.h"
#include "omp.h"

#define BILLION 1000000000L

#define NUM_THREADS 2       // Number of threads for OpenMP

struct timespec start, end;
long long int timeValues[2];    
long long unsigned int deltaTimeArray;  
double msTime = 0.;

int fex(double t, double *y, double *ydot, void *data)
{
   ydot[0] = 1.0E4 * y[1] * y[2] - .04E0 * y[0];
   ydot[2] = 3.0E7 * y[1] * y[1];
   ydot[1] = -1.0 * (ydot[0] + ydot[2]);

   return(0);
}

int test(void)
{
   # ifdef _OPENMP
   printf("Compiled by an OpenMP-compliant implementation.\n");
   # endif

   // Begin timing the algorithm
   clock_gettime(CLOCK_MONOTONIC, &start);
   timeValues[0] = (BILLION*start.tv_sec) + start.tv_nsec;

   omp_set_dynamic(0);
   omp_set_num_threads(NUM_THREADS);

   int nThreads = 0;

   #pragma omp parallel
   {
      double  atol[3], rtol[3], t, tout, y[3];
      int     neq = 3;
      int     iout;

      y[0] = 1.0E0;
      y[1] = 0.0E0;
      y[2] = 0.0E0;

      t = 0.0E0;
      tout = 0.4E0;

      struct lsoda_opt_t opt = {0};
      opt.ixpr = 0;
      opt.rtol = rtol;
      opt.atol = atol;
      opt.itask = 1;

      rtol[0] = rtol[2] = 1.0E-4;
      rtol[1] = 1.0E-4;
      atol[0] = 1.0E-6;
      atol[1] = 1.0E-10;
      atol[2] = 1.0E-6;

      struct lsoda_context_t ctx = {
        .function = fex,
        .neq = neq,
        .data = NULL,
        .state = 1,
      };

      lsoda_prepare(&ctx, &opt);

      #pragma omp master
      nThreads = omp_get_num_threads();

      #pragma omp for
      for (iout = 1; iout <= 12; iout++)
      {
         lsoda(&ctx, y, &t, tout);

         printf(" at t= %12.4e y= %14.6e %14.6e %14.6e\n", t, y[0], y[1], y[2]);

         if (ctx.state <= 0) 
         {
            printf("error istate = %d\n", ctx.state);
            exit(0);
         }
/*
         if (iout == 1) tout = 4.0E-1 * 10.0E0;
         if (iout == 2) tout = 4.0E0 * 10.0E0;
         if (iout == 3) tout = 4.0E1 * 10.0E0;
         if (iout == 4) tout = 4.0E2 * 10.0E0;
         if (iout == 5) tout = 4.0E3 * 10.0E0;
         if (iout == 6) tout = 4.0E4 * 10.0E0;
         if (iout == 7) tout = 4.0E5 * 10.0E0;
         if (iout == 8) tout = 4.0E6 * 10.0E0;
         if (iout == 9) tout = 4.0E7 * 10.0E0;
         if (iout == 10) tout = 4.0E8 * 10.0E0;
         if (iout == 11) tout = 4.0E9 * 10.0E0;
         if (iout == 12) tout = 4.0E10 * 10.0E0;
*/
         tout = tout * 10.0E0;
      }

      lsoda_free(&ctx);
   }

   if  (nThreads == NUM_THREADS) 
   {
      printf("The expected number of threads, %d, were used.\n", NUM_THREADS);
   }
   else 
   {
      printf("Expected %d OpenMP threads, but %d were used.\n", NUM_THREADS, nThreads);
   }

   // End timing the algorithm
   clock_gettime(CLOCK_MONOTONIC, &end);
   timeValues[1] = (BILLION*end.tv_sec) + end.tv_nsec;

   deltaTimeArray = timeValues[1] - timeValues[0];
   timeValues[0] = timeValues[1];
   msTime = deltaTimeArray * pow(10, -6);

   const char *elapsed = "elapsed";

   printf("%13s\n%13.06lf\n", elapsed, msTime);

   return(0);
}

int main(void) 
{
   for(int i = 0; i < 1; i++) 
   {
      test();
   }

   return(0);
}

/*
 The correct answer (up to certain precision):

 at t=   4.0000e-01 y=   9.851712e-01   3.386380e-05   1.479493e-02
 at t=   4.0000e+00 y=   9.055333e-01   2.240655e-05   9.444430e-02
 at t=   4.0000e+01 y=   7.158403e-01   9.186334e-06   2.841505e-01
 at t=   4.0000e+02 y=   4.505250e-01   3.222964e-06   5.494717e-01
 at t=   4.0000e+03 y=   1.831976e-01   8.941773e-07   8.168015e-01
 at t=   4.0000e+04 y=   3.898729e-02   1.621940e-07   9.610125e-01
 at t=   4.0000e+05 y=   4.936362e-03   1.984221e-08   9.950636e-01
 at t=   4.0000e+06 y=   5.161833e-04   2.065787e-09   9.994838e-01
 at t=   4.0000e+07 y=   5.179804e-05   2.072027e-10   9.999482e-01
 at t=   4.0000e+08 y=   5.283675e-06   2.113481e-11   9.999947e-01
 at t=   4.0000e+09 y=   4.658667e-07   1.863468e-12   9.999995e-01
 at t=   4.0000e+10 y=   1.431100e-08   5.724404e-14   1.000000e+00
*/

似乎“omp.h”有几种变体，我使用的版本可以在这里找到:

https://sites.uclouvain.be/SystInfo/usr/include/omp.h.html

我不确定这个算法是如何被线程化的。第一个悬而未决的问题是调用 lsoda() 例程的 for 循环是否应该被线程化，因为 tout 具有循环依赖性。如果它是一个增量(在我的实际实现中就是这种情况)，它可以简单地使用逗号运算符添加为 for 循环中的第二个增量语句(编辑:显然这不能用 OpenMP 完成 - for 循环必须是规范的形式并且不能有两个增量表达式)。这就是注释掉的代码部分根据 iout 的值更新 tout 的原因。即使使用那个笨重的代码块有时也会导致错误的解决方案。

所提供的测试示例是使用嵌入在 for 循环中的 test() 函数编写的，并在 main 中调用(超过一次迭代)。我不确定这背后的逻辑，但就我而言，我正在做类似的事情，在 main 的 for 循环中多次调用 test() 例程。线程化 for 循环的问题再次是循环依赖性，其中每个连续的解决方案(y[] 值)都依赖于前一个解决方案。我知道这个算法能够被线程化，但我似乎无法成功实现它。任何指针将不胜感激。

最佳答案

It seems there are several variants of "omp.h" and the version I used can be found here: [...]

不，您不能随意选择一些随机的 OMP header 。您应该使用您正在使用的 OMP 实现提供的那个，否则您将面临未定义行为的风险。

I'm not sure how this algorithm is meant to be threaded. The first outstanding question is whether or not the for loop that calls the lsoda( ) routine should be threaded since tout has a loop dependency.

您担心依赖性是正确的。 OpenMP 对此没有什么魔力——程序员负责处理依赖关系。然而，在这种情况下，您有可行的选择来打破涉及 tout 的依赖关系。，他们之中:

预计算 tout 的数组值并传递该数组的元素:

double touts[12] = { 0.4 };

for (int i = 1; i < 12; i++) touts[i] = 10 * touts[i - 1];

// ...

lsoda(&ctx, y, &t, touts[i - 1]);

计算适当的tout在函数调用时:

lsoda(&ctx, y, &t, tout * pow(10, i - 1));
// ... and avoid modifying tout later in the loop ...

但是，请注意其他依赖项。特别是，如果 lsoda()两者都读取和修改其其他参数指向的数据，然后可能会引入更难(也许是不可能)处理的额外依赖关系。

您似乎是在说确实如此:

The problem with threading that for loop is again loop dependency, in which each successive solution (the y[ ] values) is dependent on the previous solution.

如果 lsoda()计算并存储 y 的元素的新值，而那些以难以预测的方式依赖于原始值，那么游戏就结束了。你不会打破这种依赖关系，如果不打破它，你最好的结果是你得到正确的输出而没有任何加速。并不是说您可以放心地依靠看到最好的情况。

I know that this algorithm is capable of being threaded, but I can't seem to successfully implement it.

并非所有算法都是可并行的。您可以使用 OpenMP 和 lsoda() 是合理的。同时解决几个单独的问题，可以想象 lsoda()可以在内部并行化(我没有评估这种可能性)，但我认为没有理由认为您的特定测试用例可以有效地并行化。

关于c - liblsoda : Using OpenMP with gcc to thread LSODA ODE solver in C，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/52119279/

文章推荐： javascript - 如何使用 Jquery 在 angular 5 上触发点击事件

文章推荐： c# - 如何从 C# 更改 ApplicationBar 上的图像？

文章推荐： javascript - Pycurl 脚本

openmp - OpenMP 中的高斯消除
OpenMP 中的高斯消除。我是 openmp 的新手，想知道我是否在正确的地方使用了我的编译指示和屏障。我的 x 值每次都不同。他们应该是一样的吗？？ #include int num; doub
openmp - OpenMP 和矢量化之间的比较
给定一个示例函数(示例在下面给出)，for 循环可以使用 OpenMP 并行化或使用矢量化进行矢量化(假设编译器执行矢量化)。示例 void function(float* a, float* b,
openmp - OpenMP 中的原子性和关键性有什么区别？
OpenMP 中原子和关键之间有什么区别？我能做到 #pragma omp atomic g_qCount++; 但这和不一样吗 #pragma omp critical g_qCount++; ？
openmp - 给定依赖图生成 OpenMP 代码
我有一个关于如何在您考虑特定依赖关系图时生成 OpenMP 伪代码的问题。所以假设我们有这个特定的图表: 解决方案可能是这样的: #pragma omp parallel {
openmp - 使用 OpenMP 进行缩减
我正在尝试使用 openmp 计算二维矩阵的平均值。这个二维矩阵实际上是一个图像。我正在对数据进行线程分割。例如，如果我有 N线程比我处理行/N thread0 的行数，等等。我的问题是:我可以
openmp - 如何统计测量程序中的 OpenMP 性能？
我想统计测量与 OpenMP 并行化的程序的性能。我选择在执行并行算法的测试应用程序中编写循环 MAX_EXPERIMENTS次并将时间测量报告到文件中。问题解决方案似乎比提取外部循环上方的并行编译
openmp - OpenMP 中的循环顺序折叠性能建议
我找到了 Intel's performance suggestion on Xeon Phi关于 OpenMP 中的 Collapse 子句。 #pragma omp parallel for co
openmp - 如何使用 OpenMP 并行化数组移位？
如何使用 OpenMP 并行化数组移位？我尝试了一些方法，但在以下示例中没有得到任何准确的结果(该示例旋转 Carteira 对象数组的元素，用于排列算法): void rotaciona(int
openmp - 使用 openmp 并行执行函数
我有一系列对几个独立函数的调用。 func1(arg); func2(arg); func3(arg); 我想并行执行它们，而不是串行执行它们。我目前正在使用 #pragma omp parallel
openmp - openmp 中的 Dependent 子句不尊重声明的依赖
我正在尝试使用 openmp 任务来安排基本 jacobi2d 计算的平铺执行。在 jacobi2d 中，依赖于 A(i,j) 从 A(i, j) A(i-1, j) A(i+1, j) A(i, j
openmp - 在 OpenMP 中，如何让每个内核运行一个线程？
我在 3 天前开始使用 OpenMP。我想知道如何使用#pragma使每个内核运行一个线程。详细信息:- int ncores = omp_get_num_procs();for(i = 0; i <
openmp - OpenMP 中的 Schedule 子句
我有一段代码(它是应用程序的一部分)，我正在尝试使用 OpenMP 对其进行优化，正在尝试各种调度策略。就我而言，我注意到 schedule(RUNTIME)条款比其他条款有优势(我没有指定 chun
openmp - OpenMP 或 MPI 哪个更容易学习和调试？
我有一个数字运算 C/C++ 应用程序。它基本上是不同数据集的主循环。我们可以使用 openmp 和 mpi 访问一个 100 节点的集群。我想加速应用程序，但我是 mpi 和 openmp 的绝对新
openmp - OpenMP 分发中的 SECTIONS 指令如何工作？
在 OpenMP 中使用ompsections时，线程会被分配到sections内的 block ，还是每个线程会被分配到每个section？当nthreads == 3时: #pragma omp
openmp - cython openmp 单，屏障
我正在尝试在 cython 中使用 openmp。我需要在 cython 中做两件事: i) 在我的 cython 代码中使用 #pragma omp single{} 作用域。 ii) 使用#pra
openmp - 为什么 OpenMP 不能在 for 循环内有部分？
我正在尝试通过将循环的每次迭代作为 OpenMP 部分来并行化 OpenMP 中基于范围的 for 循环。我想这样做: #pragma omp parallel sections { for ( au
openmp - cython openmp 单，屏障
我正在尝试在 cython 中使用 openmp。我需要在 cython 中做两件事: i) 在我的 cython 代码中使用 #pragma omp single{} 作用域。 ii) 使用#pra
openmp - 将并行程序转换为集群程序。从 OpenMP 到？
我想编写一个代码转换器，它采用基于 OpenMP 的并行程序并在集群上运行它。我该如何解决这个问题？我使用哪些库？如何为此设置小型集群？我发现很难在 Internet 上找到有关集群计算的好 Ma
c++ - OpenMP - OpenMP 'for' 语句中的索引变量必须具有带符号的整数类型
我是 OpenMP 的新手。我正在尝试为 for 循环使用多个内核，但出现此编译错误: “错误 C3016:'x':OpenMP 'for' 语句中的索引变量必须具有带符号的整数类型”。我知道 Op
openmp - 使用 Qt creator 时如何开启 OpenMP
如果我使用 VS 2010 编译器从 Qt Creator 构建项目，我如何启用 OpenMP(从 Visual Studio 构建时，您只需启用该功能)谢谢最佳答案在 .pro 文件中尝试下一步

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - liblsoda : Using OpenMP with gcc to thread LSODA ODE solver in C