I am trying to make an SVD function using Scalapack which utilizes a process grid to distribute an input matrix across my processors. When I run the program, if I set the M and N variables, that is the number of rows and columns of the input matrix, the program will go all the way through, including a query of PDGESVD where I tell the program to find the size of the work array, until I get to the 2nd call, where the program will never return. I've had the program sit for over 30 minutes before eventually timing out.
To note, I am using INTEL MKL for Scalapack, BLACS, etc.
I have compiled my program as follows
mpicxx mycode.cpp -m64 -I$MKL_ROOT/include -L$MKL_ROOT/lib/intel64 -lmkl_scalapack_lp64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -ldl -o myExec
I run the program as such in a slurm file
#SBATCH -N 4
#SBATCH -n 4
#SBATCH -t 00:01:00
module list
pwd
date
ibrun ./myExec
ibrun is used because I am using a network of supercomputers that has its own syntax regarding running mpi related code. You should be able to use mpirun instead, as ibrun is jsut a generic replacement that does the same thing
Here is my code
#include <iostream>
#include <iomanip>
#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <mkl_blas.h>
#include <mkl_blacs.h>
#include <mkl_pblas.h>
#include <mkl_cblas.h>
#include <mkl_scalapack.h>
#include <mpi.h>
using namespace std;
int main()
{
MKL_INT mpid;
MKL_INT numproc;
blacs_pinfo_(&mpid, &numproc);
const MKL_INT zero = 0;
const MKL_INT one = 1;
const double dzero = 0.0E+0;
const double done = 1.0E+0;
const MKL_INT maxs = 1024;
MKL_INT m = 8; //number of rows in my input matrix
MKL_INT n = m; //number of columns in my input matrix
MKL_INT mb = 4; //blocking factor
MKL_INT nb = mb;
MKL_INT nprow = 2; //utilizing a 2x2 process grid
MKL_INT npcol = 2;
MKL_INT myprow;
MKL_INT mypcol;
MKL_INT contxt_sys;
blacs_get_(&zero, &zero, &contxt_sys); //get default context
MKLINT contxt_all = contxt_sys;
blacs_gridinit_(&contxt, "R", &one, &numproc);
MKL_INT contxt = contxt_sys;
blacs_gridinit(&contxt, "R", &nprow, &npcol); // create 2x2 process grid
MKL_INT contxt_zero = contxt_sys;
blacs_gridinit_(&contxt_zero, "R", &one, &one); //create grid with one process for later
blacs_gridinfo_(&contxt, &nprow, &npcol, &myprow, &mypcol); //get process row and column data
MKL_INT lld = max(1, numroc_(&m, &mb, &myprow, &zero, &nprow));
MKL_INT desca[9];
MKL_INT descatwo[9];
MKL_INT desca_zero[9];
MKL_INT descatwo_zero[9];
MKL_INT descu[9];
MKL_INT descu_zero[9];
MKL_INT descvt[9];
MKL_INT descvt_zero[9];
MKL_INT info;
//create array descriptors for local arrays
descinit_(desca, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
descinit_(descatwo, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
descinit_(descu, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
descinit_(descvt, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
//exclude these descriptors from contexts it is not part of
desca_zero[1] = -1;
descatwo_zero[1] = -1;
descu_zero[1] = -1;
descvt_zero[1] = -1;
//allocate space for our local arrays
//atwo will hold a copy of a, since PDGESVD destroys a on exit and we need to do a workspace query prior, may not be needed, havent investigated
double *a = (double *)malloc((maxs*maxs)*sizeof(double));
double *atwo = (double *)malloc((maxs*maxs)*sizeof(double));
double *u = (double *)malloc((maxs*maxs)*sizeof(double));
double *vt = (double *)malloc((maxs*maxs)*sizeof(double));
double *s = (double *)malloc((maxs)*sizeof(double));
//initialize global arrays in a way that keeps them only in process 0
double *a_zero;
double *atwo_zero;
double *u_zero;
double *vt_zero;
if(mpid==0)
{
descinit_(desca_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
descinit_(descatwo_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
descinit_(descu_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
descinit_(descvt_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
}
MKL_INT counter = 1;
if(mpid == 0)
{
a_zero = (double *)malloc((maxs*maxs)*sizeof(double));
atwo_zero = (double *)malloc((maxs*maxs)*sizeof(double));
u_zero = (double *)malloc((maxs*maxs)*sizeof(double));
vt_zero = (double *)malloc((maxs*maxs)*sizeof(double));
for(MKL_INT i = 0;i<m;i++)
{
for(MKL_INT j = 0;j<n;j++)
{
*(a_zero+i*m+j) = counter;
*(atwo_zero+i*m+j) = *(a_zero+i*m+j);
*(u_zero+i*m+j) = counter;
*(vt_zero+i*m+j) = counter;
counter++;
}
}
}
else
{
a_zero = NULL;
atwo_zero = NULL;
u_zero = NULL;
vt_zero = NULL;
}
blacs_barrier_(&contxt, "A");
//distribute global matrices to process grid
pdgemr2d(&m, &n, a_zero, &one, &one, desca_zero, a, &one, &one, desca, &contxt);
pdgemr2d(&m, &n, atwo_zero, &one, &one, descatwo_zero, atwo, &one, &one, descatwo, &contxt);
pdgemr2d(&m, &n, u_zero, &one, &one, descu_zero, u, &one, &one, descu, &contxt);
pdgemr2d(&m, &n, vt_zero, &one, &one, descvt_zero, vt, &one, &one, descvt, &contxt);
blacs_barrier_(&contxt, "A");
MKL_INT lwork = -1;
double dwork;
pdgesvd_("V", "V", &m, &n, a, &one, &one, desca, s. u, &one, &one, descu, vt, &one, &one, descvt, &dwork, &lwork, &info);
blacs_barrier_(&contxt, "A");
lwork = (MKL_INT)dwork;
double *work = (double *)malloc(lwork*sizeof(double));
blacs_barrier_(&contxt, "A");
//from here, this is where my program stall indefinitely
pdgesvd_("V", "V", &m, &n, atwo, &one, &one, descatwo, s, u, &one, &one, descu, vt, &one, &one, descvt, work, &lwork, &info);
cout << "SVD Finished" << endl;
blacs_barrier_(&contxt, "A");
//I know that I have not re gathered the local matrices into the global, but I merely have it in this state since I never get past the 2nd pdgesvd call
free(work);
free(s);
free(a);
free(atwo);
free(u);
free(vt);
if(mpid==0)
{
free(a_zero);
free(atwo_zero);
free(u_zero);
free(vt_zero);
}
blacs_exit_(&zero);
}
I should be getting past the SVD function, seeing a "SVD Finished" line in my output, however the program stalls and timesout.
I have tried different implementations of distributing the matrix across the process grid, but all lead to the same result.
Notably, setting M and N to be lower than the size of my input matrix does allow the function to work, but it does not operate across the entire matrix.