performance - Comparing MPI_Send/Recv and MPI_Scatter/Gather

I am using MPI to split matrices and send them to N processes, but I found that MPI_Scatter/Gather is not efficient enough. I wrote two programs to compare MPI_Send/Recv with MPI_Scatter/Gather.

MPI_Send/Recv:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0

double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);
double* create_vector(uint32_t n);
int print_matrix(double *m, uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
double *A, *B, *C, *A_buf, *C_buf;
double t_start, t_end, buf;
uint32_t M; //number of rows
uint32_t N; //number of columns
uint32_t nrows;
int size, rank; //MPI_Comm_size/MPI_Comm_rank expect int*
MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_SUB_VECTOR;
MPI_Comm comm;
MPI_Status status;

M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

nrows = M/size;
//create derived data type
MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
MPI_Type_commit(&MPI_MATRIX);
MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
MPI_Type_commit(&MPI_VECTOR);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_SUB_VECTOR);
MPI_Type_commit(&MPI_SUB_VECTOR);

if(rank == MASTER)
{
//A: M*N
A = create_matrix(M, N);
C = create_matrix(M, 1);

if(A == NULL || C == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
}

B = create_vector(N);
A_buf = create_matrix(nrows, N);
C_buf = zero_matrix(nrows, 1);

if(B == NULL || A_buf == NULL || C_buf == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}

if(rank == MASTER)
{
//exclude the time of establishing TCP connections
for(int i = 1;i < size;i++)
MPI_Send(&buf, 1, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);

t_start = MPI_Wtime();
for(int i = 0;i < nrows*N;i++)
A_buf[i] = A[i];

//send submatrix to other processes
for(int i = 1;i < size;i++)
{
MPI_Send(&A[i*nrows*N], 1, MPI_MATRIX, i, 0, MPI_COMM_WORLD);
MPI_Send(B, 1, MPI_VECTOR, i, 0, MPI_COMM_WORLD);
}
}
else
{
//receive to establish connection with MASTER
MPI_Recv(&buf, 1, MPI_DOUBLE, MASTER, 0, MPI_COMM_WORLD, &status);

//receive matrix
MPI_Recv(A_buf, 1, MPI_MATRIX, MASTER, 0, MPI_COMM_WORLD, &status);
MPI_Recv(B, 1, MPI_VECTOR, MASTER, 0, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD);

if(rank == MASTER)
{
for(int i = 0;i < nrows;i++)
C[i] = C_buf[i];

for(int i = 1;i < size;i++)
MPI_Recv(&C[i*nrows], 1, MPI_SUB_VECTOR, i, 0, MPI_COMM_WORLD, &status);

t_end = MPI_Wtime();
printf("%dx%d/%d: %7.4f\n", M, N, size, t_end - t_start);
}
else
{
MPI_Send(C_buf, 1, MPI_SUB_VECTOR, MASTER, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);

MPI_Type_free(&MPI_MATRIX);
MPI_Type_free(&MPI_VECTOR);
MPI_Type_free(&MPI_SUB_VECTOR);

if(rank == MASTER)
{
free(A);
free(C);
}

free(B);
free(A_buf);
free(C_buf);

MPI_Finalize();

return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

srand((unsigned)time(NULL));

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)1;
}

return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)0;
}

return matrix;
}

double* create_vector(uint32_t n)
{
return create_matrix(n, 1);
}

MPI_Scatter/Gather:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>

#define MASTER 0
double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);

int main( int argc, char** argv )
{
double t_start, t_end, buf;
double *A, *B, *C, *A_buf, *C_buf;
uint32_t M; //number of rows
uint32_t N; //number of columns
uint32_t nrows;
int size, rank; //MPI_Comm_size/MPI_Comm_rank expect int*
uint32_t i_start, i_end;
MPI_Comm comm;
MPI_Status status;
MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_RESULT;

M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;

MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

nrows = M/size;
//create derived data type
MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
MPI_Type_commit(&MPI_MATRIX);
MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
MPI_Type_commit(&MPI_VECTOR);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_RESULT);
MPI_Type_commit(&MPI_RESULT);

if(rank == MASTER)
{
//A: M*N
A = zero_matrix(M, N);
C = create_matrix(M, 1);

if(A == NULL || C == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
}

B = zero_matrix(N, 1);
A_buf = create_matrix(nrows, N);
C_buf = create_matrix(nrows, 1);

if(B == NULL || A_buf == NULL || C_buf == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}

//exclude the time of establishing TCP connections
MPI_Bcast(&buf, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);

if(rank == MASTER)
{
t_start = MPI_Wtime();
}

// scatter A
MPI_Scatter(A, 1, MPI_MATRIX, A_buf, 1, MPI_MATRIX, 0, MPI_COMM_WORLD);

// broadcast B
MPI_Bcast(B, 1, MPI_VECTOR, 0, MPI_COMM_WORLD);

// gather C
MPI_Gather(C_buf, 1, MPI_RESULT, C, 1, MPI_RESULT, 0, MPI_COMM_WORLD);

if(rank == MASTER)
{
t_end = MPI_Wtime();
printf("%d %7.4f\n", size, t_end - t_start);

free(A);
free(C);
}

MPI_Type_free(&MPI_MATRIX);
MPI_Type_free(&MPI_VECTOR);
MPI_Type_free(&MPI_RESULT);

free(B);
free(A_buf);
free(C_buf);

MPI_Finalize();

return EXIT_SUCCESS;
}

double* create_matrix(uint32_t nrow, uint32_t ncol)
{
double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

srand((unsigned)time(NULL));

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)rand();
}

return matrix;
}


double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}

for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)1;
}

return matrix;
}

I run them with the following script:
#!/bin/bash
dims="4096"
ntasks="1 2 4 8"
echo -n "" > log
for dim in $dims;
do
echo "dim=$dim:"
for n in $ntasks;
do
srun --ntasks=$n --ntasks-per-node=1 --cpu-freq=2900000 ./matrix $dim $dim | tee -a log
done
done
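
The script expects the binary to be called matrix, matching the srun line above. A typical build command (the source file name and the flags are placeholders, not taken from the question) would be something like:

mpicc -std=c99 -O2 -o matrix matrix.c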

Transfer times:
program        | ntasks=1 | ntasks=2 | ntasks=4 | ntasks=8 |
------------------------------------------------------------
send/recv      |  0.0684s |  0.0638s |  0.0654s |  0.0638s |
scatter/gather |  0.0367s |  0.0492s |  0.0765s |  0.1283s |

The time scatter/gather spends transferring the data grows so quickly that I wonder whether there is still any reason to use it instead of a loop of send/recv. I know that scatter is a wrapper around send and gather a wrapper around recv, but what else do they do?

Best Answer

To clarify: MPI_Scatter and MPI_Gather (most likely) use MPI_Send and MPI_Recv under the hood.
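
For intuition, here is a rough sketch of what such a wrapper could look like in its most naive form. The helper name is made up for illustration and this is not the code of any actual MPI library; real implementations typically replace this linear root-sends-to-everyone loop with tree-based or otherwise tuned algorithms.

#include <string.h>
#include <mpi.h>

/* naive_scatter_doubles: hypothetical helper, for illustration only.
 * The root sends one contiguous block of 'count' doubles to every other
 * rank and copies its own block; every other rank posts a single receive. */
static void naive_scatter_doubles(double *sendbuf, double *recvbuf,
                                  int count, int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if(rank == root)
    {
        for(int i = 0; i < size; i++)
        {
            if(i == root) /* the root keeps its own block */
                memcpy(recvbuf, &sendbuf[(size_t)i * count],
                       (size_t)count * sizeof(double));
            else
                MPI_Send(&sendbuf[(size_t)i * count], count, MPI_DOUBLE,
                         i, 0, comm);
        }
    }
    else
    {
        MPI_Recv(recvbuf, count, MPI_DOUBLE, root, 0, comm, MPI_STATUS_IGNORE);
    }
}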

From your code samples it seems that you do not really understand how MPI works:

  • You do not need to perform a receive or send operation to "establish a connection". MPI operations usually take care of that implicitly.
  • In your Gather/Scatter example you first distribute the data with MPI_Scatter, then broadcast some more data with MPI_Bcast, and then simply collect the data again with MPI_Gather, without doing any computation in between.
  • In your examples you do not need any explicit synchronization with MPI_Barrier.

  • Once you structure your program correctly you will see a large increase in performance (a minimal sketch of the restructured collective part is given at the end of this answer). Apart from these issues there is also a problem with MPI itself: unfortunately, the MPI standard does not give any performance guarantees, but leaves it to the actual implementation to do the best it can. Depending on the implementation you are using, MPI_Scatter/Gather may try to optimize for large messages and/or large numbers of processes, which naturally comes with some overhead.

You could try a different MPI implementation (for open source see, for example, MVAPICH) to check whether the one you are using right now simply does a bad job. However, investigating that only makes sense once your code is correct.

Also, it is best not to use the prefix MPI_ for your own identifiers. It makes your code hard to read, and, if I am not mistaken, the MPI standard reserves that prefix for MPI library functions.
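
Putting these points together, below is a minimal sketch of how the middle of the collective version could be restructured. It reuses the variable names from the question's program (A, B, C, A_buf, C_buf, nrows, N, MASTER) and is an outline under those assumptions rather than a drop-in replacement; plain MPI_DOUBLE counts are used instead of the MPI_-prefixed derived types, and the inner loop merely stands in for whatever computation is actually intended between the scatter and the gather.

//distribute the rows of A and broadcast B; no "connection" messages, no barriers
MPI_Scatter(A, (int)(nrows * N), MPI_DOUBLE,
            A_buf, (int)(nrows * N), MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
MPI_Bcast(B, (int)N, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

//local computation: each rank multiplies its block of rows by B
for(uint32_t i = 0; i < nrows; i++)
{
    C_buf[i] = 0.0;
    for(uint32_t j = 0; j < N; j++)
        C_buf[i] += A_buf[i * N + j] * B[j];
}

//collect the partial results on the master
MPI_Gather(C_buf, (int)nrows, MPI_DOUBLE,
           C, (int)nrows, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);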

Regarding performance - Comparing MPI_Send/Recv and MPI_Scatter/Gather, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/30954310/
