I am using MPI to split a matrix into blocks of rows and send them to N processes, but I found that MPI_Scatter/Gather are not efficient enough. I wrote two programs to compare MPI_Send/Recv with MPI_Scatter/Gather.
MPI_Send/Recv:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>
#define MASTER 0
double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);
double* create_vector(uint32_t n);
int print_matrix(double *m, uint32_t nrow, uint32_t ncol);
int main( int argc, char** argv )
{
double *A, *B, *C, *A_buf, *C_buf;
double t_start, t_end, buf;
uint32_t M; //number of rows
uint32_t N; //number of columns
uint32_t nrows;
int size, rank; //MPI_Comm_size/MPI_Comm_rank expect int
MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_SUB_VECTOR;
MPI_Comm comm;
MPI_Status status;
M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
nrows = M/size;
//create derived data type
MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
MPI_Type_commit(&MPI_MATRIX);
MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
MPI_Type_commit(&MPI_VECTOR);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_SUB_VECTOR);
MPI_Type_commit(&MPI_SUB_VECTOR);
if(rank == MASTER)
{
//A: M*N
A = create_matrix(M, N);
C = create_matrix(M, 1);
if(A == NULL || C == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
}
B = create_vector(N);
A_buf = create_matrix(nrows, N);
C_buf = zero_matrix(nrows, 1);
if(B == NULL || A_buf == NULL || C_buf == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
if(rank == MASTER)
{
//exclude the time of establishing TCP connections
for(int i = 1;i < size;i++)
MPI_Send(&buf, 1, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
t_start = MPI_Wtime();
for(int i = 0;i < nrows*N;i++)
A_buf[i] = A[i];
//send submatrix to other processes
for(int i = 1;i < size;i++)
{
MPI_Send(&A[i*nrows*N], 1, MPI_MATRIX, i, 0, MPI_COMM_WORLD);
MPI_Send(B, 1, MPI_VECTOR, i, 0, MPI_COMM_WORLD);
}
}
else
{
//receive to establish connection with MASTER
MPI_Recv(&buf, 1, MPI_DOUBLE, MASTER, 0, MPI_COMM_WORLD, &status);
//receive matrix
MPI_Recv(A_buf, 1, MPI_MATRIX, MASTER, 0, MPI_COMM_WORLD, &status);
MPI_Recv(B, 1, MPI_VECTOR, MASTER, 0, MPI_COMM_WORLD, &status);
}
MPI_Barrier(MPI_COMM_WORLD);
if(rank == MASTER)
{
for(int i = 0;i < nrows;i++)
C[i] = C_buf[i];
for(int i = 1;i < size;i++)
MPI_Recv(&C[i*nrows], 1, MPI_SUB_VECTOR, i, 0, MPI_COMM_WORLD, &status);
t_end = MPI_Wtime();
printf("%dx%d/%d: %7.4f\n", M, N, size, t_end - t_start);
}
else
{
MPI_Send(C_buf, 1, MPI_SUB_VECTOR, MASTER, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Type_free(&MPI_MATRIX);
MPI_Type_free(&MPI_VECTOR);
MPI_Type_free(&MPI_SUB_VECTOR);
if(rank == MASTER)
{
free(A);
free(C);
}
free(B);
free(A_buf);
free(C_buf);
MPI_Finalize();
return EXIT_SUCCESS;
}
double* create_matrix(uint32_t nrow, uint32_t ncol)
{
double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}
srand((unsigned)time(NULL));
for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)1;
}
return matrix;
}
double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}
for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)0;
}
return matrix;
}
double* create_vector(uint32_t n)
{
return create_matrix(n, 1);
}
MPI_Scatter/Gather:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#include <mpi.h>
#include <string.h>
#define MASTER 0
double* create_matrix(uint32_t nrow, uint32_t ncol);
double* zero_matrix(uint32_t nrow, uint32_t ncol);
int main( int argc, char** argv )
{
double t_start, t_end, buf;
double *A, *B, *C, *A_buf, *C_buf;
uint32_t M; //number of rows
uint32_t N; //number of columns
uint32_t nrows;
int size, rank; //MPI_Comm_size/MPI_Comm_rank expect int
MPI_Comm comm;
MPI_Status status;
MPI_Datatype MPI_MATRIX, MPI_VECTOR, MPI_RESULT;
M = (atoi(argv[1]) > 0)?atoi(argv[1]):1;
N = (atoi(argv[2]) > 0)?atoi(argv[2]):1;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
nrows = M/size;
//create derived data type
MPI_Type_contiguous(nrows*N, MPI_DOUBLE, &MPI_MATRIX);
MPI_Type_commit(&MPI_MATRIX);
MPI_Type_contiguous(N, MPI_DOUBLE, &MPI_VECTOR);
MPI_Type_commit(&MPI_VECTOR);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &MPI_RESULT);
MPI_Type_commit(&MPI_RESULT);
if(rank == MASTER)
{
//A: M*N
A = zero_matrix(M, N);
C = create_matrix(M, 1);
if(A == NULL || C == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
}
B = zero_matrix(N, 1);
A_buf = create_matrix(nrows, N);
C_buf = create_matrix(nrows, 1);
if(B == NULL || A_buf == NULL || C_buf == NULL)
{
printf( "Allocation of matrix failed.\n" );
exit(EXIT_FAILURE);
}
//exclude the time of establishing TCP connections
MPI_Bcast(&buf, 1, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
if(rank == MASTER)
{
t_start = MPI_Wtime();
}
// scatter A
MPI_Scatter(A, 1, MPI_MATRIX, A_buf, 1, MPI_MATRIX, 0, MPI_COMM_WORLD);
// broadcast B
MPI_Bcast(B, 1, MPI_VECTOR, 0, MPI_COMM_WORLD);
// gather C
MPI_Gather(C_buf, 1, MPI_RESULT, C, 1, MPI_RESULT, 0, MPI_COMM_WORLD);
if(rank == MASTER)
{
t_end = MPI_Wtime();
printf("%d %7.4f\n", size, t_end - t_start);
free(A);
free(C);
}
MPI_Type_free(&MPI_MATRIX);
MPI_Type_free(&MPI_VECTOR);
MPI_Type_free(&MPI_RESULT);
free(B);
free(A_buf);
free(C_buf);
MPI_Finalize();
return EXIT_SUCCESS;
}
double* create_matrix(uint32_t nrow, uint32_t ncol)
{
double *matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}
srand((unsigned)time(NULL));
for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)rand();
}
return matrix;
}
double* zero_matrix(uint32_t nrow, uint32_t ncol)
{
double* matrix = (double *)malloc(sizeof(double)*nrow*ncol);
if(matrix == NULL)
{
return NULL;
}
for(uint32_t i = 0;i < nrow*ncol;i++)
{
matrix[i] = (double)0;
}
return matrix;
}
I used the following script to run them both:
#!/bin/bash
dims="4096"
ntasks="1 2 4 8"
echo -n "" > log
for dim in $dims;
do
echo "dim=$dim:"
for n in $ntasks;
do
srun --ntasks=$n --ntasks-per-node=1 --cpu-freq=2900000 ./matrix $dim $dim | tee -a log
done
done
Transfer times (4096x4096 matrix, varying number of tasks):
program        | ntasks=1 | ntasks=2 | ntasks=4 | ntasks=8 |
---------------|----------|----------|----------|----------|
send/recv      | 0.0684s  | 0.0638s  | 0.0654s  | 0.0638s  |
scatter/gather | 0.0367s  | 0.0492s  | 0.0765s  | 0.1283s  |
The transfer time of scatter/gather grows quickly as the number of tasks increases. Do I still have a reason to use it instead of a loop of send/recv? I know that scatter is a wrapper around send and gather is a wrapper around recv, but what do they do beyond that?
For clarification, both MPI_Scatter and MPI_Gather (most probably) use MPI_Send AND MPI_Recv under the hood.
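To get an intuition, a purely linear scatter could be written with point-to-point calls roughly like the sketch below. This is only an illustration, not what any real library actually does; actual implementations typically switch between linear, tree-based and pipelined algorithms depending on message size and process count, and the function name naive_scatter is made up here:

#include <string.h>
#include <mpi.h>

/* Illustrative sketch of a *linear* scatter of `count` doubles per rank.
 * Real MPI_Scatter implementations usually use smarter algorithms. */
void naive_scatter(double *sendbuf, double *recvbuf,
                   int count, int root, MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (rank == root) {
        for (int i = 0; i < size; i++) {
            if (i == root) /* root keeps its own block, no message needed */
                memcpy(recvbuf, &sendbuf[(size_t)i * count],
                       (size_t)count * sizeof(double));
            else
                MPI_Send(&sendbuf[(size_t)i * count], count, MPI_DOUBLE,
                         i, 0, comm);
        }
    } else {
        MPI_Recv(recvbuf, count, MPI_DOUBLE, root, 0, comm,
                 MPI_STATUS_IGNORE);
    }
}

A tree-based scatter typically needs only on the order of log2(size) communication steps on the root instead of size-1, which is where the benefit of the collective shows up for larger process counts, and also where some of the extra setup overhead you measured comes from.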
From your code examples it seems like you don't really understand how MPI works:
- You don't need to perform a receive or send operation to "establish a connection". MPI operations will generally take care of that implicitly.
- In your Scatter/Gather example, you first distribute data with MPI_Scatter, then you broadcast some more data with MPI_Bcast, and then you just collect data again with MPI_Gather, without doing any computation in between.
- In your examples, you don't need any explicit synchronization with MPI_Barrier.

You'll see a big performance increase once you structure your program correctly (a rough sketch follows below). Apart from those issues, there is also one with MPI itself: unfortunately, the MPI standard does not give any performance guarantees but leaves it to the actual implementation to do the best it can. MPI_Scatter/Gather, depending on the implementation you use, may try to optimize for large messages and/or a large number of processes, which naturally comes with some overhead.
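Here is a rough sketch of what the core of your second program could look like once it is structured as distribute, compute, collect. It reuses the variables and allocations from that program (A and C only on the master), assumes that the computation you intend is the local matrix-vector product C_buf = A_buf * B, and passes plain MPI_DOUBLE counts instead of the derived types:

/* distribute, compute, collect -- no handshake sends, no barriers */
if (rank == MASTER)
    t_start = MPI_Wtime();

/* each rank gets its block of nrows rows of A */
MPI_Scatter(A, nrows * N, MPI_DOUBLE, A_buf, nrows * N, MPI_DOUBLE,
            MASTER, MPI_COMM_WORLD);
/* every rank needs the full vector B */
MPI_Bcast(B, N, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

/* the actual work: C_buf = A_buf * B */
for (uint32_t i = 0; i < nrows; i++) {
    C_buf[i] = 0.0;
    for (uint32_t j = 0; j < N; j++)
        C_buf[i] += A_buf[i * N + j] * B[j];
}

/* collect the partial results into C on the master */
MPI_Gather(C_buf, nrows, MPI_DOUBLE, C, nrows, MPI_DOUBLE,
           MASTER, MPI_COMM_WORLD);

if (rank == MASTER) {
    t_end = MPI_Wtime();
    printf("%ux%u/%d: %7.4f\n", M, N, size, t_end - t_start);
}

With real work between the collectives you are no longer measuring pure communication, and the comparison against a Send/Recv loop becomes much more meaningful.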
You could try a different MPI implementation (for open source, see e.g. MVAPICH) to see whether the one you are using right now is just doing a bad job. However, investigating that will only make sense once you get your code right.
Also, it's better not to use the prefix MPI_ for your own identifiers. It makes your code hard to read and, if I am not mistaken, the MPI standard reserves that prefix for MPI library functions.
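For example, the derived types from your programs could be declared with names of your own; row_block_t, full_vector_t and result_block_t below are just arbitrary examples:

/* same derived types as in the question, without the reserved MPI_ prefix */
MPI_Datatype row_block_t, full_vector_t, result_block_t;

MPI_Type_contiguous(nrows * N, MPI_DOUBLE, &row_block_t);
MPI_Type_commit(&row_block_t);
MPI_Type_contiguous(N, MPI_DOUBLE, &full_vector_t);
MPI_Type_commit(&full_vector_t);
MPI_Type_contiguous(nrows, MPI_DOUBLE, &result_block_t);
MPI_Type_commit(&result_block_t);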