clang - LLVM 如何通过运行时库调用转换 OpenMP 多线程代码？-6ren

clang - LLVM 如何通过运行时库调用转换 OpenMP 多线程代码？

转载作者：行者123 更新时间：2023-12-02 01:45:23

当我研究LLVM OpenMP Runtime Library document时，我发现有一个关于工作共享的例子:

extern float foo( void );
int main () {
    int i;
    float r = 0.0;
    #pragma omp parallel for schedule(dynamic) reduction(+:r)
    for ( i = 0; i < 10; i ++ ) {
        r += foo();
    }
}

然后它显示转换后的代码，如下所示:

extern float foo( void );
int main () {
    static int zero = 0;
    auto int gtid;
    auto float r = 0.0;
    __kmpc_begin( & loc3, 0 );
    // The gtid is not actually required in this example so could be omitted;
    // We show its initialization here because it is often required for calls into
    // the runtime and should be locally cached like this.
    gtid = __kmpc_global thread num( & loc3 );
    __kmpc_fork call( & loc7, 1, main_7_parallel_3, & r );
    __kmpc_end( & loc0 );
    return 0;
}

struct main_10_reduction_t_5 { float r_10_rpr; };

static kmp_critical_name lck = { 0 };
static ident_t loc10; // loc10.flags should contain KMP_IDENT_ATOMIC_REDUCE bit set
                      // if compiler has generated an atomic reduction.
void main_7_parallel_3( int *gtid, int *btid, float *r_7_shp ) {
    auto int i_7_pr;
    auto int lower, upper, liter, incr;
    auto struct main_10_reduction_t_5 reduce;
    reduce.r_10_rpr = 0.F;
    liter = 0;
    __kmpc_dispatch_init_4( & loc7,*gtid, 35, 0, 9, 1, 1 );
    while ( __kmpc_dispatch_next_4( & loc7, *gtid, & liter, & lower, & upper, & incr
      ) ) {
        for( i_7_pr = lower; upper >= i_7_pr; i_7_pr ++ )
          reduce.r_10_rpr += foo();
    }
    switch( __kmpc_reduce_nowait( & loc10, *gtid, 1, 4, & reduce, main_10_reduce_5, &lck ) ) {
        case 1:
           *r_7_shp += reduce.r_10_rpr;
           __kmpc_end_reduce_nowait( & loc10, *gtid, & lck );
           break;
        case 2:
           __kmpc_atomic_float4_add( & loc10, *gtid, r_7_shp, reduce.r_10_rpr );
           break;
        default:;
    }
}

我花了很多时间寻找 OpenMP 如何转换上面的代码，但仍然找不到像示例那样显示结果的方法，以及它如何在 OpenMP 中工作。

所以，这是我困惑了很长时间的问题:有没有办法像示例一样输出文件或直接显示结果？

最佳答案

您可以检查 LLVM IR(请参阅 https://llvm.org/docs/LangRef.html)。

例如:

clang -fopenmp -O2 -emit-llvm -S -o - example.c

将以下内容打印到标准输出:

[...]
; Function Attrs: nounwind uwtable
define dso_local i32 @main() local_unnamed_addr #0 {
entry:
  %i = alloca i32, align 4
  %r = alloca float, align 4
  %0 = bitcast i32* %i to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
  %1 = bitcast float* %r to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
  store float 0.000000e+00, float* %r, align 4, !tbaa !2
  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @0, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %i, float* nonnull %r) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
  ret i32 0
}
[...]

; Function Attrs: norecurse nounwind uwtable
define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* nocapture readnone dereferenceable(4) %i, float* nocapture dereferenceable(4) %r) #2 {
entry:
  %.omp.lb = alloca i32, align 4
  %.omp.ub = alloca i32, align 4
  %.omp.stride = alloca i32, align 4
  %.omp.is_last = alloca i32, align 4
  %r1 = alloca float, align 4
  %.omp.reduction.red_list = alloca [1 x i8*], align 8
  %0 = bitcast i32* %.omp.lb to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4
  store i32 0, i32* %.omp.lb, align 4, !tbaa !6
  %1 = bitcast i32* %.omp.ub to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4
  store i32 9, i32* %.omp.ub, align 4, !tbaa !6
  %2 = bitcast i32* %.omp.stride to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4
  store i32 1, i32* %.omp.stride, align 4, !tbaa !6
  %3 = bitcast i32* %.omp.is_last to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4
  store i32 0, i32* %.omp.is_last, align 4, !tbaa !6
  %4 = bitcast float* %r1 to i8*
  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #4
  store float 0.000000e+00, float* %r1, align 4, !tbaa !2
  %5 = load i32, i32* %.global_tid., align 4, !tbaa !6
  tail call void @__kmpc_dispatch_init_4(%struct.ident_t* nonnull @0, i32 %5, i32 35, i32 0, i32 9, i32 1, i32 1) #4
  %6 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
  %tobool14 = icmp eq i32 %6, 0
  br i1 %tobool14, label %omp.dispatch.end, label %omp.dispatch.body

omp.dispatch.cond.loopexit:                       ; preds = %omp.inner.for.body, %omp.dispatch.body
  %7 = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* nonnull @0, i32 %5, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride) #4
  %tobool = icmp eq i32 %7, 0
  br i1 %tobool, label %omp.dispatch.end, label %omp.dispatch.body

omp.dispatch.body:                                ; preds = %entry, %omp.dispatch.cond.loopexit
  %8 = load i32, i32* %.omp.lb, align 4, !tbaa !6
  %9 = load i32, i32* %.omp.ub, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
  %cmp12 = icmp sgt i32 %8, %9
  br i1 %cmp12, label %omp.dispatch.cond.loopexit, label %omp.inner.for.body

omp.inner.for.body:                               ; preds = %omp.dispatch.body, %omp.inner.for.body
  %.omp.iv.013 = phi i32 [ %add4, %omp.inner.for.body ], [ %8, %omp.dispatch.body ]
  %call = call float @foo() #4, !llvm.mem.parallel_loop_access !8
  %10 = load float, float* %r1, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !8
  %add3 = fadd float %call, %10
  store float %add3, float* %r1, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !8
  %add4 = add nsw i32 %.omp.iv.013, 1
  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !6, !llvm.mem.parallel_loop_access !8
  %cmp = icmp slt i32 %.omp.iv.013, %11
  br i1 %cmp, label %omp.inner.for.body, label %omp.dispatch.cond.loopexit, !llvm.loop !8

omp.dispatch.end:                                 ; preds = %omp.dispatch.cond.loopexit, %entry
  %12 = bitcast [1 x i8*]* %.omp.reduction.red_list to float**
  store float* %r1, float** %12, align 8
  %13 = bitcast [1 x i8*]* %.omp.reduction.red_list to i8*
  %14 = call i32 @__kmpc_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, i32 1, i64 8, i8* nonnull %13, void (i8*, i8*)* nonnull @.omp.reduction.reduction_func, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
  switch i32 %14, label %.omp.reduction.default [
    i32 1, label %.omp.reduction.case1
    i32 2, label %.omp.reduction.case2
  ]

.omp.reduction.case1:                             ; preds = %omp.dispatch.end
  %15 = load float, float* %r, align 4, !tbaa !2
  %16 = load float, float* %r1, align 4, !tbaa !2
  %add5 = fadd float %15, %16
  store float %add5, float* %r, align 4, !tbaa !2
  call void @__kmpc_end_reduce_nowait(%struct.ident_t* nonnull @1, i32 %5, [8 x i32]* nonnull @.gomp_critical_user_.reduction.var) #4
  br label %.omp.reduction.default

.omp.reduction.case2:                             ; preds = %omp.dispatch.end
  %17 = bitcast float* %r to i32*
  %atomic-load = load atomic i32, i32* %17 monotonic, align 4, !tbaa !2
  %18 = load float, float* %r1, align 4, !tbaa !2
  br label %atomic_cont

atomic_cont:                                      ; preds = %atomic_cont, %.omp.reduction.case2
  %19 = phi i32 [ %atomic-load, %.omp.reduction.case2 ], [ %23, %atomic_cont ]
  %20 = bitcast i32 %19 to float
  %add7 = fadd float %18, %20
  %21 = bitcast float %add7 to i32
  %22 = cmpxchg i32* %17, i32 %19, i32 %21 monotonic monotonic
  %23 = extractvalue { i32, i1 } %22, 0
  %24 = extractvalue { i32, i1 } %22, 1
  br i1 %24, label %.omp.reduction.default, label %atomic_cont

.omp.reduction.default:                           ; preds = %atomic_cont, %.omp.reduction.case1, %omp.dispatch.end
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4
  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4
  ret void
}
[...]

关于clang - LLVM 如何通过运行时库调用转换 OpenMP 多线程代码？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/52285368/

文章推荐： c++ - 如何让 cpack 生成正确的开始菜单快捷方式

文章推荐： regex - Perl:从字符串中提取多个数字

文章推荐： android - Google 登录错误 12500

css - 如何在CSS中用一种颜色绘制一半对 Angular 线，用另一种颜色绘制另一半对 Angular 线
我将 Bootstrap 与 css 和 java 脚本结合使用。在不影响前端代码的情况下，我真的很难在css中绘制这个背景。在许多问题中，人们将宽度和高度设置为 0%。但是由于我的导航栏，我不能使用
c - scanf ("%79[^\n]",line); 之间的区别vs scanf ("%79[^\n]\n"，线) vs scanf ("%79[^\n]s"，线)
我正在用 c 编写一个程序来读取文件的内容。代码如下: #include void main() { char line[90]; while(scanf("%79[^\
Javascript获取矩阵数组的对 Angular 线
我想使用 javascript 获取矩阵数组的所有对 Angular 线。假设输入输出如下: input = [ [1,2,3], [4,5,6], [7,8,9], ] output =
pdfmake - 线，圆和其他形状？
可以用pdfmake绘制lines，circles和other shapes吗？如果是，是否有documentation或样本？我想用jsPDF替换pdfmake。最佳答案是的，有可能。 pdfm
svg - 如何制作带有边框的SVG“线”？
我有一个小svg小部件，其目的是显示角度列表（参见图片）。现在，角度是线元素，仅具有笔触，没有填充。但是现在我想使用一种“内部填充”颜色和一种“笔触/边框”颜色。我猜想line元素不能解决这个问题，
java - 线-三角形交点检查返回错误的交点
我正在为带有三角对象的 3D 场景编写一个非常基本的光线转换器，一切都工作正常，直到我决定尝试从场景原点 (0/0/0) 以外的点转换光线。但是，当我将光线原点更改为 (0/1/0) 时，相交测试突
Python Shebang 线
这个问题已经有答案了: Why do people write "#!/usr/bin/env python" on the first line of a Python script? (22 个回
java - 如何在屏幕上绘制星号(*)线？
如何使用大约 50 个星号 * 并使用 for 循环绘制一条水平线？当我尝试这样做时，结果是垂直(而不是水平)列出 50 个星号。 public void drawAstline() { f
java - 球不动；线？
这是一个让球以对角线方式下降的 UI，但球保持静止；线程似乎无法正常工作。你能告诉我如何让球移动吗？请下载一个球并更改目录，以便程序可以找到您的球的分配位置。没有必要下载足球场，但如果您愿意，也可以
ant - Jmeter和Ant报90%线
我在我的一个项目中使用 Jmeter 和 Ant，当我们生成报告时，它会在报告中显示 URL、#Samples、失败、成功率、平均时间、最短时间、最长时间。我也想在报告中包含 90% 的时间线。现
Java Swing 线
我有一个不寻常的问题，希望有人能帮助我。我想用 Canvas (android) 画一条 Swing 或波浪线，但我不知道该怎么做。它将成为蝌蚪的尾部，所以理想情况下我希望它的形状更像三角形，一端更大
javafx粗笔画路径(线)与圆之间的碰撞问题
这个问题已经有答案了: Checking Collision of Shapes with JavaFX (1 个回答) 已关闭 8 年前。我正在使用 JavaFx 8 库。我的任务很简单:我想检
bash - 如何按编号的百分比拆分文件。线？
如何按编号的百分比拆分文件。行数？假设我想将我的文件分成 3 个部分(60%/20%/20% 部分)，我可以手动执行此操作，-_-: $ wc -l brown.txt 57339 brown.tx
圆圈之间的 HTML 线
我正在努力实现这样的目标: 但这就是我设法做到的。你能帮我实现预期的结果吗？更新: 如果我删除 bootstrap.css 依赖项，问题就会消失。我怎样才能让它与 Bootstrap 一起工作？
css - 转换比例导致间隙/线
我目前正在构建一个网站，但遇到了 transform: scale 的问题。我有一个按钮，当用户将鼠标悬停在它上面时，会发生两件事: 背景以对 Angular 线“扫过” 按钮标签颜色改变按钮稍微变
macos - 非常快速地绘制图形(线)
我需要使用直线和仿射变换绘制大量数据点的图形(缩放图形以适合 View )。目前，我正在使用 NSBezierPath，但我认为它效率很低(因为点在绘制之前被复制到贝塞尔路径)。通过将我的数据切割成
image - 从车牌上删除多余的像素/线
我正在使用基于 SVM 分类的 HOG 特征检测器。我可以成功提取车牌，但提取的车牌除了车牌号外还有一些不必要的像素/线。我的图像处理流程如下: 在灰度图像上应用 HOG 检测器裁剪检测到的区域调
python - 间隙填充轮廓/线
我有以下图片: 我想填充它的轮廓(即我想在这张图片中填充线条)。我尝试了形态学闭合，但使用大小为 3x3 的矩形内核和 10 迭代并没有填满整个边界。我还尝试了一个 21x21 内核和 1 迭代，但
algorithm - 统计两组序列的交点(线)
我必须找到一种算法，可以找到两组数组之间的交集总数，而其中一个数组已排序。举个例子，我们有这两个数组，我们向相应的数字画直线。这两个数组为我们提供了总共 7 个交集。有什么样的算法可以帮助我解决
c++ - 如何从带有透视投影的近裁剪平面绘制射线/线？
简单地说 - 我想使用透视投影从近裁剪平面绘制一条射线/线到远裁剪平面。我有我认为是使用各种 OpenGL/图形编程指南中描述的方法通过单击鼠标生成的正确标准化的世界坐标。我遇到的问题是我的光线似乎

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

clang - LLVM 如何通过运行时库调用转换 OpenMP 多线程代码？