gpt4 book ai didi

c++ - OpenMP 中的并行合并排序

转载 作者:塔克拉玛干 更新时间:2023-11-02 23:21:58 27 4
gpt4 key购买 nike

我在 this 中看到了并行合并排序算法纸。这是代码:

void mergesort_parallel_omp (int a[], int size, int temp[], int threads) 
{
if ( threads == 1) { mergesort_serial(a, size, temp); }
else if (threads > 1)
{
#pragma omp parallel sections
{
#pragma omp section
mergesort_parallel_omp(a, size/2, temp, threads/2);
#pragma omp section
mergesort_parallel_omp(a + size/2, size - size/2, temp + size/2, threads - threads/2);
}
merge(a, size, temp);
} // threads > 1
}

我在多核上运行它。发生的情况是,在树的叶子上,有 2 个线程并行运行。他们完成工作后,另外 2 个线程启动,依此类推。即使我们有所有叶节点的空闲核心。

我认为原因是此 OpenMP 代码不会在并行区域内创建并行区域。我说得对吗?

最佳答案

I think the reason is that OpenMP cannot create parallel regionsinside parallel regions

你可以有一个平行区域的平行区域。

OpenMP parallel regions can be nested inside each other. If nestedparallelism is disabled, then the new team created by a threadencountering a parallel construct inside a parallel region consistsonly of the encountering thread. If nested parallelism is enabled,then the new team may consist of more than one thread (source).

为了正确运行您的代码,您需要调用 omp_set_nested(1)omp_set_num_threads(2)

Nested parallelism can be enabled or disabled by setting theOMP_NESTED environment variable or calling omp_set_nested() function


为了获得更好的性能而不是部分,您可以使用 OpenMP 任务(详细信息和示例可以在 here 中找到),如下所示:

void merge(int * X, int n, int * tmp) {
...
}

void mergeSort(int *X, int n, int *tmp)
{
if (n < 2) return;

#pragma omp task shared(X) if (n > TASK_SIZE)
mergeSort(X, n/2, tmp);

#pragma omp task shared(X) if (n > TASK_SIZE)
mergeSort(X+(n/2), n-(n/2), tmp + n/2);

#pragma omp taskwait
mergeSortAux(X, n, tmp);
}



int main()
{
...
#pragma omp parallel
{
#pragma omp single
mergesort(data, n, tmp);
}
}

merge算法的时序代码来自Dr. Johnnie W. Baker webpage.。但是,我在此答案中提供的代码有一些更正和性能改进。

一个完整的运行示例:

#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#define TASK_SIZE 100

unsigned int rand_interval(unsigned int min, unsigned int max)
{
// https://stackoverflow.com/questions/2509679/
int r;
const unsigned int range = 1 + max - min;
const unsigned int buckets = RAND_MAX / range;
const unsigned int limit = buckets * range;

do
{
r = rand();
}
while (r >= limit);

return min + (r / buckets);
}

void fillupRandomly (int *m, int size, unsigned int min, unsigned int max){
for (int i = 0; i < size; i++)
m[i] = rand_interval(min, max);
}

void mergeSortAux(int *X, int n, int *tmp) {
int i = 0;
int j = n/2;
int ti = 0;

while (i<n/2 && j<n) {
if (X[i] < X[j]) {
tmp[ti] = X[i];
ti++; i++;
} else {
tmp[ti] = X[j];
ti++; j++;
}
}
while (i<n/2) { /* finish up lower half */
tmp[ti] = X[i];
ti++; i++;
}
while (j<n) { /* finish up upper half */
tmp[ti] = X[j];
ti++; j++;
}
memcpy(X, tmp, n*sizeof(int));
}

void mergeSort(int *X, int n, int *tmp)
{
if (n < 2) return;

#pragma omp task shared(X) if (n > TASK_SIZE)
mergeSort(X, n/2, tmp);

#pragma omp task shared(X) if (n > TASK_SIZE)
mergeSort(X+(n/2), n-(n/2), tmp + n/2);

#pragma omp taskwait
mergeSortAux(X, n, tmp);
}

void init(int *a, int size){
for(int i = 0; i < size; i++)
a[i] = 0;
}

void printArray(int *a, int size){
for(int i = 0; i < size; i++)
printf("%d ", a[i]);
printf("\n");
}

int isSorted(int *a, int size){
for(int i = 0; i < size - 1; i++)
if(a[i] > a[i + 1])
return 0;
return 1;
}

int main(int argc, char *argv[]) {
srand(123456);
int N = (argc > 1) ? atoi(argv[1]) : 10;
int print = (argc > 2) ? atoi(argv[2]) : 0;
int numThreads = (argc > 3) ? atoi(argv[3]) : 2;
int *X = malloc(N * sizeof(int));
int *tmp = malloc(N * sizeof(int));

omp_set_dynamic(0); /** Explicitly disable dynamic teams **/
omp_set_num_threads(numThreads); /** Use N threads for all parallel regions **/

// Dealing with fail memory allocation
if(!X || !tmp)
{
if(X) free(X);
if(tmp) free(tmp);
return (EXIT_FAILURE);
}

fillupRandomly (X, N, 0, 5);

double begin = omp_get_wtime();
#pragma omp parallel
{
#pragma omp single
mergeSort(X, N, tmp);
}
double end = omp_get_wtime();
printf("Time: %f (s) \n",end-begin);

assert(1 == isSorted(X, N));

if(print){
printArray(X, N);
}

free(X);
free(tmp);
return (EXIT_SUCCESS);
}

4 核机器上的 had-doc 基准测试产生以下结果:

100000000 elements 
1 thread : Time: 11.052081 (s)
2 threads: Time: 5.907508 (s)
4 threads: Time: 4.984839 (s)

A overall Speed up of 2.21x

future 的改进将在 GitHub 上提供。


并行版本的高级 C++ 版本可以在 here 中找到。最终算法如下所示:

void mergeSortRecursive(vector<double>& v, unsigned long left, unsigned long right) {
if (left < right) {
if (right-left >= 32) {
unsigned long mid = (left+right)/2;
#pragma omp taskgroup
{
#pragma omp task shared(v) untied if(right-left >= (1<<14))
mergeSortRecursive(v, left, mid);
#pragma omp task shared(v) untied if(right-left >= (1<<14))
mergeSortRecursive(v, mid+1, right);
#pragma omp taskyield
}
inplace_merge(v.begin()+left, v.begin()+mid+1, v.begin()+right+1);
}else{
sort(v.begin()+left, v.begin()+right+1);
}
}
}
}


void mergeSort(vector<double>& v) {
#pragma omp parallel
#pragma omp single
mergeSortRecursive(v, 0, v.size()-1);
}

报告的 48 线程加速为 6.61x

关于c++ - OpenMP 中的并行合并排序,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/13811114/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com