Scalapack Routine PDGESVD_ does not return when called

93 views Asked by At

I am trying to make an SVD function using Scalapack which utilizes a process grid to distribute an input matrix across my processors. When I run the program, if I set the M and N variables, that is the number of rows and columns of the input matrix, the program will go all the way through, including a query of PDGESVD where I tell the program to find the size of the work array, until I get to the 2nd call, where the program will never return. I've had the program sit for over 30 minutes before eventually timing out.

To note, I am using INTEL MKL for Scalapack, BLACS, etc.

I have compiled my program as follows

mpicxx mycode.cpp -m64 -I$MKL_ROOT/include -L$MKL_ROOT/lib/intel64 -lmkl_scalapack_lp64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm -ldl -o myExec

I run the program as such in a slurm file

#SBATCH -N 4
#SBATCH -n 4
#SBATCH -t 00:01:00

module list
pwd
date

ibrun ./myExec

ibrun is used because I am using a network of supercomputers that has its own syntax regarding running mpi related code. You should be able to use mpirun instead, as ibrun is jsut a generic replacement that does the same thing

Here is my code

#include <iostream>
#include <iomanip>
#include <stdio.h>
#include <stdlib.h>
#include <mkl.h>
#include <mkl_blas.h>
#include <mkl_blacs.h>
#include <mkl_pblas.h>
#include <mkl_cblas.h>
#include <mkl_scalapack.h>
#include <mpi.h>

using namespace std;

int main()
{
     MKL_INT mpid;
     MKL_INT numproc;
     blacs_pinfo_(&mpid, &numproc);

     const MKL_INT zero = 0;
     const MKL_INT one = 1;
     const double dzero = 0.0E+0;
     const double done = 1.0E+0;
     const MKL_INT maxs = 1024;

     MKL_INT m = 8; //number of rows in my input matrix
     MKL_INT n = m; //number of columns in my input matrix
     MKL_INT mb = 4; //blocking factor
     MKL_INT nb = mb;
     MKL_INT nprow = 2; //utilizing a 2x2 process grid
     MKL_INT npcol = 2;
     MKL_INT myprow;
     MKL_INT mypcol; 

     MKL_INT contxt_sys;
     blacs_get_(&zero, &zero, &contxt_sys); //get default context
     MKLINT contxt_all = contxt_sys;
     blacs_gridinit_(&contxt, "R", &one, &numproc);

     MKL_INT contxt = contxt_sys;
     blacs_gridinit(&contxt, "R", &nprow, &npcol); // create 2x2 process grid
     
     MKL_INT contxt_zero = contxt_sys;
     blacs_gridinit_(&contxt_zero, "R", &one, &one); //create grid with one process for later
     blacs_gridinfo_(&contxt, &nprow, &npcol, &myprow, &mypcol); //get process row and column data

     MKL_INT lld = max(1, numroc_(&m, &mb, &myprow, &zero, &nprow));
     MKL_INT desca[9];
     MKL_INT descatwo[9]; 
     MKL_INT desca_zero[9];
     MKL_INT descatwo_zero[9];
     MKL_INT descu[9];
     MKL_INT descu_zero[9];
     MKL_INT descvt[9];
     MKL_INT descvt_zero[9];
     MKL_INT info;

     //create array descriptors for local arrays
     descinit_(desca, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
     descinit_(descatwo, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
     descinit_(descu, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);
     descinit_(descvt, &m, &n, &mb, &nb, &zero, &zero, &contxt, &lld, &info);

     //exclude these descriptors from contexts it is not part of
     desca_zero[1] = -1;
     descatwo_zero[1] = -1;
     descu_zero[1] = -1;
     descvt_zero[1] = -1;

     //allocate space for our local arrays
     //atwo will hold a copy of a, since PDGESVD destroys a on exit and we need to do a workspace query prior, may not be needed, havent investigated

     double *a = (double *)malloc((maxs*maxs)*sizeof(double));
     double *atwo = (double *)malloc((maxs*maxs)*sizeof(double));
     double *u = (double *)malloc((maxs*maxs)*sizeof(double));
     double *vt = (double *)malloc((maxs*maxs)*sizeof(double));
     double *s = (double *)malloc((maxs)*sizeof(double));

     //initialize global arrays in a way that keeps them only in process 0
     double *a_zero;
     double *atwo_zero;
     double *u_zero;
     double *vt_zero;

     if(mpid==0)
     {
          descinit_(desca_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
          descinit_(descatwo_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
          descinit_(descu_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
          descinit_(descvt_zero, &m, &n, &m, &n, &zero, &zero, &contxt_zero, &m, &info);
     }

     MKL_INT counter = 1;
     if(mpid == 0)
     {
          a_zero = (double *)malloc((maxs*maxs)*sizeof(double));
          atwo_zero = (double *)malloc((maxs*maxs)*sizeof(double));
          u_zero = (double *)malloc((maxs*maxs)*sizeof(double));
          vt_zero = (double *)malloc((maxs*maxs)*sizeof(double));
          for(MKL_INT i = 0;i<m;i++)
          {
               for(MKL_INT j = 0;j<n;j++)
               {
                    *(a_zero+i*m+j) = counter;
                    *(atwo_zero+i*m+j) = *(a_zero+i*m+j);
                    *(u_zero+i*m+j) = counter;
                    *(vt_zero+i*m+j) = counter;
                    counter++;
               }
          }
     }
     else
     {
          a_zero = NULL;
          atwo_zero = NULL;
          u_zero = NULL;
          vt_zero = NULL;
     }

     blacs_barrier_(&contxt, "A");

     //distribute global matrices to process grid
     pdgemr2d(&m, &n, a_zero, &one, &one, desca_zero, a, &one, &one, desca, &contxt);
     pdgemr2d(&m, &n, atwo_zero, &one, &one, descatwo_zero, atwo, &one, &one, descatwo, &contxt);
     pdgemr2d(&m, &n, u_zero, &one, &one, descu_zero, u, &one, &one, descu, &contxt);
     pdgemr2d(&m, &n, vt_zero, &one, &one, descvt_zero, vt, &one, &one, descvt, &contxt);

     blacs_barrier_(&contxt, "A");

     MKL_INT lwork = -1;
     double dwork;

     pdgesvd_("V", "V", &m, &n, a, &one, &one, desca, s. u, &one, &one, descu, vt, &one, &one, descvt, &dwork, &lwork, &info);
     blacs_barrier_(&contxt, "A");
     lwork = (MKL_INT)dwork;

     double *work = (double *)malloc(lwork*sizeof(double));

     blacs_barrier_(&contxt, "A");

     //from here, this is where my program stall indefinitely
     pdgesvd_("V", "V", &m, &n, atwo, &one, &one, descatwo, s, u, &one, &one, descu, vt, &one, &one, descvt, work, &lwork, &info);

     cout << "SVD Finished" << endl;

     blacs_barrier_(&contxt, "A");

     //I know that I have not re gathered the local matrices into the global, but I merely have it in this state since I never get past the 2nd pdgesvd call

     free(work);
     free(s);
     free(a);
     free(atwo);
     free(u);
     free(vt);
     if(mpid==0)
     {
          free(a_zero);
          free(atwo_zero);
          free(u_zero);
          free(vt_zero);
     }
     blacs_exit_(&zero);
}

I should be getting past the SVD function, seeing a "SVD Finished" line in my output, however the program stalls and timesout.

I have tried different implementations of distributing the matrix across the process grid, but all lead to the same result.

Notably, setting M and N to be lower than the size of my input matrix does allow the function to work, but it does not operate across the entire matrix.

0

There are 0 answers