gpt4 book ai didi

c - 使用 C/OpenMP 时的性能问题

转载 作者:行者123 更新时间:2023-12-04 07:22:29 25 4
gpt4 key购买 nike

我写了一些代码来测试使用 C 和 OpenMP 的小程序的执行时间,我遇到了一些应用程序执行时间的问题。
这是一段代码,负责添加 2 个 vector :

  /* Excerpt (float case) of the timed section; the full listing below
     defines DATA_SIZE, N_THREADS, range, start_time, etc. */
  float *x_f = (float *)malloc(sizeof(float) * DATA_SIZE);
float *y_f = (float *)malloc(sizeof(float) * DATA_SIZE);

/* Fill the operands with known values. */
for (int i = 0; i < DATA_SIZE; i++) {

x_f[i] = 1.0f;
y_f[i] = 2.0f;
}

start_time = omp_get_wtime();

/* Manual static partition: each thread adds its own contiguous slice;
   the last thread also takes the remainder of DATA_SIZE / N_THREADS. */
#pragma omp parallel num_threads(N_THREADS)
{
const int thread_id = omp_get_thread_num();

int begin = range * thread_id;
int end = begin + range;
if (thread_id + 1 == N_THREADS)
end = DATA_SIZE;

for (int i = begin; i < end; i++) {
x_f[i] += y_f[i];
}
}
end_time = omp_get_wtime();
duration = end_time - start_time;
我在同一个应用程序中用同样的代码来相加元素类型为 double 的 vector。程序似乎工作正常,但结果对我来说有点奇怪:float 的计算时间比 double 大几倍。
Float      1       2       3       4      5        6      7        8       9      10      11      12
1024 0,0036 0,4535 0,6875 0,9443 1,1653 1,5068 1,6951 2,0447 2,3546 2,6611 3,1319 3,1468
double 1 2 3 4 5 6 7 8 9 10 11 12
1024 0,0004 0,0014 0,0016 0,0019 0,0018 0,0018 0,002 0,0021 0,0024 0,0028 0,0045 0,0036
1-12 是 OpenMP 线程的数量,1024 是 vector 的元素个数。时间以毫秒为单位。有人可以解释一下为什么会发生这种情况,或者我做错了什么吗?我是 C 和 OpenMP 的新手,不明白为什么会得到上面这样的结果。
编辑:下面的完整源代码:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>


/*
 * Benchmark: element-wise addition of two vectors, once with float and
 * once with double, each timed with omp_get_wtime().
 *
 * argv[1] = number of elements, argv[2] = number of OpenMP threads.
 * Returns 0 on success, 1 on allocation failure, 2 on bad arguments.
 */
int main(int argc, char const *argv[]) {
    double start_time, end_time, duration;

    if (argc < 3 || argv[1] == NULL || argv[2] == NULL) {
        printf("Error parsing data from input. Program will now close");
        return 2;
    }
    int DATA_SIZE = atoi(argv[1]);
    int N_THREADS = atoi(argv[2]);

    /* atoi() yields 0 on garbage input; a non-positive thread count would
       divide by zero below, a non-positive size makes no sense. */
    if (DATA_SIZE <= 0 || N_THREADS <= 0) {
        printf("Error parsing data from input. Program will now close");
        return 2;
    }

    /* Elements per thread; the last thread also takes the remainder. */
    int range = DATA_SIZE / N_THREADS;

    // ===================== FLOAT ========================
    float *x_f = malloc(sizeof(float) * DATA_SIZE);
    float *y_f = malloc(sizeof(float) * DATA_SIZE);
    if (x_f == NULL || y_f == NULL) {
        printf("Error: out of memory\n");
        free(x_f); /* free(NULL) is a no-op */
        free(y_f);
        return 1;
    }

    for (int i = 0; i < DATA_SIZE; i++) {
        x_f[i] = 1.0f;
        y_f[i] = 2.0f;
    }

    start_time = omp_get_wtime();
#pragma omp parallel num_threads(N_THREADS)
    {
        const int thread_id = omp_get_thread_num();

        /* Static partition: each thread owns a contiguous [begin, end). */
        int begin = range * thread_id;
        int end = begin + range;
        if (thread_id + 1 == N_THREADS)
            end = DATA_SIZE; /* last thread picks up the remainder */

        for (int i = begin; i < end; i++) {
            x_f[i] += y_f[i];
        }
    }

    end_time = omp_get_wtime();
    duration = end_time - start_time;

    // Error checking: every element must now be 1.0f + 2.0f == 3.0f.
    for (int i = 0; i < DATA_SIZE; i++) {
        if (x_f[i] != 3.0f) { /* clearer than the old !(3.0 - x) == 0 */
            printf("ERROR: %f\n", x_f[i]);
            break;
        }
    }
    free(x_f);
    free(y_f);

    printf("==========[FLOAT]==========\n");
    printf("Number of threads: %d\n", N_THREADS);
    /* %zu: the product DATA_SIZE * sizeof(float) has type size_t;
       passing it to %d is undefined behavior. */
    printf("Data size: %zu bytes\n", DATA_SIZE * sizeof(float));
    printf("ExecTime: %lf ms\n", duration * 1000);

    // ===================== DOUBLE ========================
    double *x_lf = malloc(sizeof(double) * DATA_SIZE);
    double *y_lf = malloc(sizeof(double) * DATA_SIZE);
    if (x_lf == NULL || y_lf == NULL) {
        printf("Error: out of memory\n");
        free(x_lf);
        free(y_lf);
        return 1;
    }

    for (int i = 0; i < DATA_SIZE; i++) {
        x_lf[i] = 1.0;
        y_lf[i] = 2.0;
    }

    start_time = omp_get_wtime();

#pragma omp parallel num_threads(N_THREADS)
    {
        const int thread_id = omp_get_thread_num();

        int begin = range * thread_id;
        int end = begin + range;
        if (thread_id + 1 == N_THREADS)
            end = DATA_SIZE;

        for (int i = begin; i < end; i++) {
            x_lf[i] += y_lf[i];
        }
    }

    end_time = omp_get_wtime();
    duration = end_time - start_time;

    // Error checking: every element must now be 1.0 + 2.0 == 3.0.
    for (int i = 0; i < DATA_SIZE; i++) {
        if (x_lf[i] != 3.0) {
            printf("ERROR: %f\n", x_lf[i]);
            break;
        }
    }
    free(x_lf);
    free(y_lf);

    printf("\n==========[DOUBLE]==========\n");
    printf("Number of threads: %d\n", N_THREADS);
    printf("Data size: %zu bytes\n", DATA_SIZE * sizeof(double));
    printf("ExecTime: %lf ms\n", duration * 1000);

    return 0;
}
编辑2:完整的结果表
Results

最佳答案

Here is the code refactored .也许有意义。使用 sheredom/ubench .
请注意编译器参数:-fopenmp -s -lm -O3。可以尝试不同的 -O 优化级别并比较差异。
(作为旁注,请阅读:32 OpenMP Traps for C++ Developers)
这是代码:


#include "https://raw.githubusercontent.com/sheredom/ubench.h/master/ubench.h"

#include <assert.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define DATA_SIZE 100000
#define N_THREADS 2
/* Elements per thread (integer division; the benchmark bodies hand the
   remainder to the last thread). */
#define range DATA_SIZE/N_THREADS

/* All benchmark operands live in one heap-allocated struct so the float
   and double cases work on the same data set; zero until app_start(). */
static struct {
float x_f[DATA_SIZE] ;
float y_f[DATA_SIZE] ;
double x_d[DATA_SIZE] ;
double y_d[DATA_SIZE] ;
} * app_data = 0 ;

/* Allocate the shared benchmark data and fill the float/double operand
   arrays with known values (x = 1, y = 2). Aborts via assert on OOM. */
static void app_start(void)
{
    app_data = calloc(1, sizeof *app_data);
    assert(app_data);

    for (int k = 0; k < DATA_SIZE; ++k) {
        app_data->x_f[k] = 1.0f;
        app_data->y_f[k] = 2.0f;
        app_data->x_d[k] = 1.0;
        app_data->y_d[k] = 2.0;
    }
}

/* Benchmark case: x_f += y_f over a manual static partition. */
UBENCH( omp_measuring, adding_two_arrays_of_floats )
{
#pragma omp parallel num_threads(N_THREADS)
    {
        const int tid = omp_get_thread_num();

        /* Contiguous slice for this thread; the last thread absorbs the
           division remainder. */
        const int first = range * tid;
        const int last = (tid + 1 == N_THREADS) ? DATA_SIZE : first + range;

        for (int i = first; i < last; i++) {
            app_data->x_f[i] += app_data->y_f[i];
        }
    }
}

/* Benchmark case: x_d += y_d over a manual static partition. */
UBENCH( omp_measuring, adding_two_arrays_of_doubles )
{
#pragma omp parallel num_threads(N_THREADS)
    {
        const int tid = omp_get_thread_num();

        /* Contiguous slice for this thread; the last thread absorbs the
           division remainder. */
        const int first = range * tid;
        const int last = (tid + 1 == N_THREADS) ? DATA_SIZE : first + range;

        for (int i = first; i < last; i++) {
            app_data->x_d[i] += app_data->y_d[i];
        }
    }
}

static void app_end (void) { free(app_data); }


UBENCH_STATE();

/* ubench_main() runs every registered UBENCH case; the data set is
   built once before the run and freed once after it. */
int main(int argc, const char *const argv[])
{
app_start();

ubench_main(argc, argv);

app_end();

}
买者自负
这里 OMP 的意义变得(非常)有限,因为 OMP 显然只有在不开启优化时才占优势——很难胜过 gcc 的 -O3。
Godbolt 的问题在于它在大型数据集上很卡。在 VLA(又名超大阵列)上使用 OMP 应该是有意义的。
实际上,很少需要处理兆字节或千兆字节的数据集。然后人们可能会想象 GPU 正在被使用。没有 OMP。
奖金 app_start UBENCH 没有规定。可以将应用程序命令行参数传递给不同的 app_start :
/* Sketch: forward the program's command-line arguments to app_start. */
static void app_start(const unsigned argc, char ** argv )
{
// use app arguments here
// NOTE(review): argc is declared unsigned here while main passes an
// int — harmless for non-negative argc, but the types should match.
}

int main (int argc, char ** argv)
{
app_start(argc,argv);
ubench_main(argc,argv);
app_end();
}

关于c - 使用 C/OpenMP 时的性能问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/68412244/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com