performance - Comparing MPI_Send/Recv and MPI_Scatter/Gather

I am using MPI to split matrices and send them to N processes, but I found that MPI_Scatter/Gather is not efficient enough. I wrote two programs to compare MPI_Send/Recv with MPI_Scatter/Gather.

MPI_Send/Recv:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0

double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);
double* create_vector(uint32_t n);
int print_matrix(double *m, uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
double *A, *B, *C, *A_buf, *C_buf;
double t_start, t_end, buf;
uint32_t M; //number of rows
uint32_t N; //number of columns
uint32_t nrows;
int size, rank; //MPI_Comm_size/MPI_Comm_rank expect int*
MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_SUB_VECTOR;
MPI_Comm comm;
MPI_Status status;

M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

nrows = M/size;
//create derived data type
MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
MPI_Type_commit(&MPI_MATRIX);
MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
MPI_Type_commit(&MPI_VECTOR);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_SUB_VECTOR);
MPI_Type_commit(&MPI_SUB_VECTOR);

if(rank == MASTER)
{
//A: M*N
A = create_matrix(M, N);
C = create_matrix(M, 1);

if(A == NULL || C == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
}

B = create_vector(N);
A_buf = create_matrix(nrows, N);
C_buf = zero_matrix(nrows, 1);

if(B == NULL || A_buf == NULL || C_buf == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}

if(rank == MASTER)
{
//exclude the time of establishing TCP connections
for(int i = 1;i < size;i++)
MPI_Send(&buf, 1, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);

t_start = MPI_Wtime();
for(int i = 0;i < nrows*N;i++)
A_buf[i] = A[i];

//send submatrix to other processes
for(int i = 1;i < size;i++)
{
MPI_Send(&A[i*nrows*N], 1, MPI_MATRIX, i, 0, MPI_COMM_WORLD);
MPI_Send(B, 1, MPI_VECTOR, i, 0, MPI_COMM_WORLD);
}
}
else
{
//receive to establish connection with MASTER
MPI_Recv(&buf, 1, MPI_DOUBLE, MASTER, 0, MPI_COMM_WORLD, &status);

//receive matrix
MPI_Recv(A_buf, 1, MPI_MATRIX, MASTER, 0, MPI_COMM_WORLD, &status);
MPI_Recv(B, 1, MPI_VECTOR, MASTER, 0, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD);

if(rank == MASTER)
{
for(int i = 0;i < nrows;i++)
C[i] = C_buf[i];

for(int i = 1;i < size;i++)
MPI_Recv(&C[i*nrows], 1, MPI_SUB_VECTOR, i, 0, MPI_COMM_WORLD, &status);

t_end = MPI_Wtime();
printf("%dx%d/%d: %7.4f\n", M, N, size, t_end - t_start);
}
else
{
MPI_Send(C_buf, 1, MPI_SUB_VECTOR, MASTER, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);

MPI_Type_free(&MPI_MATRIX);
MPI_Type_free(&MPI_VECTOR);
MPI_Type_free(&MPI_SUB_VECTOR);

if(rank == MASTER)
{
free(A);
free(C);
}

free(B);
free(A_buf);
free(C_buf);

MPI_Finalize();

return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

srand((unsigned)time(NULL));

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)1;
}

return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)0;
}

return matrix;
}

double* create_vector(uint32_t n)
{
return create_matrix(n, 1);
}

MPI_Scatter/Gather:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0
double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
double t_start, t_end, buf;
double *A, *B, *C, *A_buf, *C_buf;
uint32_t M; //number of rows
uint32_t N; //number of columns
uint32_t nrows;
int size, rank; //MPI_Comm_size/MPI_Comm_rank expect int*
uint32_t i_start, i_end;
MPI_Comm comm;
MPI_Status status;
MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_RESULT;

M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

nrows = M/size;
//create derived data type
MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
MPI_Type_commit(&MPI_MATRIX);
MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
MPI_Type_commit(&MPI_VECTOR);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_RESULT);
MPI_Type_commit(&MPI_RESULT);

if(rank == MASTER)
{
//A: M*N
A = zero_matrix(M, N);
C = create_matrix(M, 1);

if(A == NULL || C == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
}

B = zero_matrix(N, 1);
A_buf = create_matrix(nrows, N);
C_buf = create_matrix(nrows, 1);

if(B == NULL || A_buf == NULL || C_buf == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}

//exclude the time of establishing TCP connections
MPI_Bcast(&buf, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);

if(rank == MASTER)
{
t_start = MPI_Wtime();
}

// scatter A
MPI_Scatter(A, 1, MPI_MATRIX, A_buf, 1, MPI_MATRIX, 0, MPI_COMM_WORLD);

// broadcast B
MPI_Bcast(B, 1, MPI_VECTOR, 0, MPI_COMM_WORLD);

// gather C
MPI_Gather(C_buf, 1, MPI_RESULT, C, 1, MPI_RESULT, 0, MPI_COMM_WORLD);

if(rank == MASTER)
{
t_end = MPI_Wtime();
printf("%d %7.4f\n", size, t_end - t_start);

free(A);
free(C);
}

MPI_Type_free(&MPI_MATRIX);
MPI_Type_free(&MPI_VECTOR);
MPI_Type_free(&MPI_RESULT);

free(B);
free(A_buf);
free(C_buf);

MPI_Finalize();

return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

srand((unsigned)time(NULL));

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)rand();
}

return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)1;
}

return matrix;
}

I run them with the following script:
#!/bin/bash
dims="4096"
ntasks="1 2 4 8"
echo -n "" > log
for dim in $dims;
do
echo "dim=$dim:"
for n in $ntasks;
do
srun --ntasks=$n --ntasks-per-node=1 --cpu-freq=2900000 ./matrix $dim $dim | tee -a log
done
done
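
The script expects the binary to be called matrix, matching the srun line above. A typical build command (the source file name and the flags are placeholders, not taken from the question) would be something like:

mpicc -std=c99 -O2 -o matrix matrix.c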

Transfer times:
program        | ntasks=1 | ntasks=2 | ntasks=4 | ntasks=8 |
------------------------------------------------------------
send/recv      |  0.0684s |  0.0638s |  0.0654s |  0.0638s |
scatter/gather |  0.0367s |  0.0492s |  0.0765s |  0.1283s |

The time scatter/gather spends transferring the data grows so quickly that I wonder whether there is still any reason to use it instead of a loop of send/recv. I know that scatter is a wrapper around send and gather a wrapper around recv, but what else do they do?

Best Answer

To clarify: MPI_Scatter and MPI_Gather (most likely) use MPI_Send and MPI_Recv under the hood.
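
For intuition, here is a rough sketch of what such a wrapper could look like in its most naive form. The helper name is made up for illustration and this is not the code of any actual MPI library; real implementations typically replace this linear root-sends-to-everyone loop with tree-based or otherwise tuned algorithms.

#include <string.h>
#include <mpi.h>

/* naive_scatter_doubles: hypothetical helper, for illustration only.
 * The root sends one contiguous block of 'count' doubles to every other
 * rank and copies its own block; every other rank posts a single receive. */
static void naive_scatter_doubles(double *sendbuf, double *recvbuf,
                                  int count, int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if(rank == root)
    {
        for(int i = 0; i < size; i++)
        {
            if(i == root) /* the root keeps its own block */
                memcpy(recvbuf, &sendbuf[(size_t)i * count],
                       (size_t)count * sizeof(double));
            else
                MPI_Send(&sendbuf[(size_t)i * count], count, MPI_DOUBLE,
                         i, 0, comm);
        }
    }
    else
    {
        MPI_Recv(recvbuf, count, MPI_DOUBLE, root, 0, comm, MPI_STATUS_IGNORE);
    }
}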

From your code samples it seems that you do not really understand how MPI works:

  • You do not need to perform a receive or send operation to "establish a connection". MPI operations usually take care of that implicitly.
  • In your Gather/Scatter example you first distribute the data with MPI_Scatter, then broadcast some more data with MPI_Bcast, and then simply collect the data again with MPI_Gather, without doing any computation in between.
  • In your examples you do not need any explicit synchronization with MPI_Barrier.

  • Once you structure your program correctly you will see a large increase in performance (a minimal sketch of the restructured collective part is given at the end of this answer). Apart from these issues there is also a problem with MPI itself: unfortunately, the MPI standard does not give any performance guarantees, but leaves it to the actual implementation to do the best it can. Depending on the implementation you are using, MPI_Scatter/Gather may try to optimize for large messages and/or large numbers of processes, which naturally comes with some overhead.

You could try a different MPI implementation (for open source see, for example, MVAPICH) to check whether the one you are using right now simply does a bad job. However, investigating that only makes sense once your code is correct.

Also, it is best not to use the prefix MPI_ for your own identifiers. It makes your code hard to read, and, if I am not mistaken, the MPI standard reserves that prefix for MPI library functions.
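
Putting these points together, below is a minimal sketch of how the middle of the collective version could be restructured. It reuses the variable names from the question's program (A, B, C, A_buf, C_buf, nrows, N, MASTER) and is an outline under those assumptions rather than a drop-in replacement; plain MPI_DOUBLE counts are used instead of the MPI_-prefixed derived types, and the inner loop merely stands in for whatever computation is actually intended between the scatter and the gather.

//distribute the rows of A and broadcast B; no "connection" messages, no barriers
MPI_Scatter(A, (int)(nrows * N), MPI_DOUBLE,
            A_buf, (int)(nrows * N), MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
MPI_Bcast(B, (int)N, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

//local computation: each rank multiplies its block of rows by B
for(uint32_t i = 0; i < nrows; i++)
{
    C_buf[i] = 0.0;
    for(uint32_t j = 0; j < N; j++)
        C_buf[i] += A_buf[i * N + j] * B[j];
}

//collect the partial results on the master
MPI_Gather(C_buf, (int)nrows, MPI_DOUBLE,
           C, (int)nrows, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);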

Regarding performance - Comparing MPI_Send/Recv and MPI_Scatter/Gather, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/30954310/
