#include <mpi.h>
#include <cmath>
#include <iostream>
#include <vector>

// Hypercubic all_to_all function adapted to the matrix transpose operation
void HPC_Alltoall_H(void *sbuf, int scount, MPI_Datatype stype,
                    void *rbuf, int rcount, MPI_Datatype rtype, MPI_Comm comm) {
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    int n = static_cast<int>(std::sqrt(scount * size)); // full matrix dimension
    int blockSize = n / size;
    // std::cout << blockSize << " ";

    int *stored_message = static_cast<int *>(sbuf);
    std::vector<int> temp(stored_message, stored_message + scount);
    std::vector<int> in_sendBuffer(blockSize * blockSize * size / 2);
    std::vector<int> in_recvBuffer(blockSize * blockSize * size / 2);

    int dims = static_cast<int>(std::log2(size)); // number of hypercube dimensions
    for (int i = dims - 1; i >= 0; --i) {
        int diff = 1 << i;
        int partner = rank ^ diff; // partner's rank for this phase

        if (rank < partner) {
            // Processor with the smaller rank: pack the upper halves of the
            // paired blocks from temp into the pre-sized send buffer
            for (int j = 0; j < size / 2 / diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    in_sendBuffer[j * blockSize * blockSize * diff + k] =
                        temp[j * 2 * diff * blockSize * blockSize + diff * blockSize * blockSize + k];
                }
            }
            std::cout << rank << " " << partner << std::endl;
            // Perform the data exchange with the partner
            MPI_Sendrecv(in_sendBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, rank, 0,
                         in_recvBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, partner, 0,
                         comm, MPI_STATUS_IGNORE);
            std::cout << "Finish small rank send select" << std::endl;
            // Unpack the received data back into the same positions of temp
            for (int j = 0; j < size / 2 / diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    temp[j * 2 * diff * blockSize * blockSize + diff * blockSize * blockSize + k] =
                        in_recvBuffer[j * diff * blockSize * blockSize + k];
                }
            }
            std::cout << "Finish small rank recv select" << std::endl;
        } else {
            // Processor with the larger rank: pack the lower halves of the
            // paired blocks from temp into the pre-sized send buffer
            for (int j = 0; j < size / 2 / diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    in_sendBuffer[j * blockSize * blockSize * diff + k] =
                        temp[j * 2 * diff * blockSize * blockSize + k];
                }
            }
            // Perform the data exchange with the partner
            MPI_Sendrecv(in_sendBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, rank, 0,
                         in_recvBuffer.data(), blockSize * blockSize * size / 2, MPI_INT, partner, 0,
                         comm, MPI_STATUS_IGNORE);
            std::cout << "Finish large rank send select" << std::endl;
            // Unpack the received data back into the same positions of temp
            for (int j = 0; j < size / 2 / diff; j++) {
                for (int k = 0; k < blockSize * blockSize * diff; k++) {
                    temp[j * 2 * diff * blockSize * blockSize + k] =
                        in_recvBuffer[j * diff * blockSize * blockSize + k];
                }
            }
            std::cout << "Finish large rank recv select" << std::endl;
        }
    }
    std::cout << "Finish temp" << std::endl;
    for (int i = 0; i < blockSize * blockSize * size; i++) {
        static_cast<int *>(rbuf)[i] = temp[i];
    }
    std::cout << "Finish." << std::endl;
}
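For context, this is roughly how I call the function in my test program. The matrix size N = 8 and the fill pattern are just assumptions of this sketch, not requirements of the routine, and it is assumed to be compiled in the same file as HPC_Alltoall_H above:

// Minimal test driver (sketch only). N and the fill values are placeholders
// chosen so that every element is globally unique and easy to check.
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int N = 8;                   // matrix dimension, assumed divisible by size
    const int perRank = N * N / size;  // elements held by each rank

    std::vector<int> sendBuf(perRank), recvBuf(perRank);
    for (int i = 0; i < perRank; ++i) {
        sendBuf[i] = rank * perRank + i;  // globally unique values
    }

    HPC_Alltoall_H(sendBuf.data(), perRank, MPI_INT,
                   recvBuf.data(), perRank, MPI_INT, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}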
I am trying to implement the hypercubic permutation for an all-to-all collective communication in order to perform a matrix transpose, but when I run the code it gets stuck at the MPI_Sendrecv call.
The ranks and partners appear to be matched correctly and the buffer sizes are correct, yet I cannot figure out what causes MPI_Sendrecv to hang.
I print the rank and partner numbers, and they are paired correctly when running with 8 processors (see the attached algorithm visualization).
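This is essentially the standalone check I use to verify the pairing (a sketch only; the phase/rank/partner print format is just for illustration):

// Standalone pairing check (sketch): in every phase each rank and its partner
// should print each other, e.g. with 8 ranks phase 2 pairs 0<->4, 1<->5, 2<->6, 3<->7.
#include <mpi.h>
#include <cmath>
#include <iostream>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int dims = static_cast<int>(std::log2(size));
    for (int i = dims - 1; i >= 0; --i) {
        int partner = rank ^ (1 << i);
        std::cout << "phase " << i << ": rank " << rank
                  << " <-> partner " << partner << std::endl;
    }
    MPI_Finalize();
    return 0;
}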