I recently had the need to compute an SVD from some C code. Given the stability and wide acceptance of LAPACK I decided to use that. The code seemed to be running a lot slower than I thought it should. I've been led to believe that Python and Numpy compute SVD via LAPACK, so I decided to compare speeds against Python. To my suprise the Python code is much faster at computing the SVD than the C code. In fact, the Python code can compute a full SVD about as fast as the C/LAPACK code computes half of an SVD.
This is my first real experience callling Fortran from C, so I suppose I am doing something wrong that is causing the slowdown, but I have no idea what it is.
Question: Why is the Python code so much faster than the C code?
In the end I need a faster SVD in C, so the follow-on question is:
Question: Is there a stable performant C library for computing partial SVD (preferrably of sparse matrices).
Here is some minimal code for timing tests. The matrix used doesn't have any particular meaning, i just needed something that I could easily generate in both C and Python.
test_svd.c
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
extern void dgesvdx(
char* jobu,
char* jobv,
char* range,
int* m,
int* n,
double* a,
int* lda,
double* vl,
double* vu,
int* il,
int* iu,
int* ns,
double* s,
double* u,
int* ldu,
double* vt,
int* ldvt,
double* work,
int* lwork,
int* iwork,
int* info);
extern void dgesdd(
char* jobz,
int* m,
int* n,
double* a,
int* lda,
double* s,
double* u,
int* ldu,
double* vt,
int* ldvt,
double* work,
int* lwork,
int* iwork,
int* info);
typedef enum bool {false, true} bool;
bool svd(double* a,
int num_rows,
int num_cols,
int num_sing_vals,
double* u,
double* s,
double* vt,
bool use_dgesdd) {
double* a_work;
int* iwork;
double* work;
int lwork;
int min_m_n;
int info = 0;
double vl = 0;
double vu = 1;
int il = 1;
int iu = num_sing_vals;
int ns = num_sing_vals;
//////////////////////////////////////////////////////
// Compute and allocate optimal workspace
//////////////////////////////////////////////////////
min_m_n = (num_cols < num_rows) ? num_cols : num_rows;
lwork = -1;
// copy a to prevent corruption
a_work = (double*)malloc(num_cols*num_rows*sizeof(double));
// compute optimal workspace (lwork)
double worksize;
if(use_dgesdd) {
iwork = (int*)malloc(7*(min_m_n)*sizeof(int));
dgesdd("S",&num_cols,&num_rows,a_work,&num_cols,s,vt,&num_cols,u,&num_sing_vals,&worksize,&lwork,iwork,&info);
} else {
iwork = (int*)malloc(24*(min_m_n)*sizeof(int));
dgesvdx("V","V","I",&num_cols,&num_rows,a_work,&num_cols,&vl,&vu,&il,&iu,&ns,s,vt,&num_cols,u,&num_sing_vals,&worksize,&lwork,iwork,&info);
}
if(info) {
perror("error computing lwork\n");
return(false);
}
// allocate work
lwork = (int)worksize;
work = (double*)malloc(lwork*sizeof(double));
//////////////////////////////////////////////////////
// Compute the svd
//////////////////////////////////////////////////////
memcpy(a_work,a,num_cols*num_rows*sizeof(double));
if(use_dgesdd) {
dgesdd("S",&num_cols,&num_rows,a_work,&num_cols,s,vt,&num_cols,u,&num_sing_vals,work,&lwork,iwork,&info);
} else {
ns = num_sing_vals;
dgesvdx("V","V","I",&num_cols,&num_rows,a_work,&num_cols,&vl,&vu,&il,&iu,&ns,s,vt,&num_cols,u,&num_sing_vals,work,&lwork,iwork,&info);
}
if(info<0) {
perror("invalid argument in SVD\n");
return(false);
} else if(info>0) {
printf("SVD did not converge");
return(false);
}
free(iwork);
iwork = NULL;
free(work);
work = NULL;
free(a_work);
a_work = NULL;
return(true);
}
int main(int argc, char** argv) {
int num_rows = 1000;
int num_cols = 1000;
int size = num_rows*num_cols;
double* A = (double*)calloc(num_rows*num_cols,sizeof(double));
for(int r = 0; r < num_rows; ++r) {
for(int c = 0; c < num_cols; ++c) {
A[r*num_cols+c] = sin(5*M_PI*(r*num_cols+c)/((double)size))+cos(10*M_PI*(c*num_rows+c)/((double)size));
if(fabs(A[r*num_cols+c])<0.75 || fabs(A[r*num_cols+c]) > 0.9) {
A[r*num_cols+c] = 0;
}
}
}
int num_sing_vals = (num_rows < num_cols) ? num_rows : num_cols;
double* u_sdd = (double*)calloc(num_rows*num_sing_vals,sizeof(double));
double* s_sdd = (double*)calloc(num_sing_vals,sizeof(double));
double* vt_sdd = (double*)calloc(num_sing_vals*num_cols,sizeof(double));
clock_t tic = clock();
bool success = svd(A,num_rows,num_cols,num_sing_vals,u_sdd,s_sdd,vt_sdd,true);
clock_t toc = clock();
printf("full dgesdd %lf seconds\n",(double)(toc-tic)/CLOCKS_PER_SEC);
double* u_svd = (double*)calloc(num_rows*num_sing_vals,sizeof(double));
double* s_svd = (double*)calloc(num_sing_vals,sizeof(double));
double* vt_svd = (double*)calloc(num_sing_vals*num_cols,sizeof(double));
tic = clock();
success = svd(A,num_rows,num_cols,num_sing_vals,u_svd,s_svd,vt_svd,false);
toc = clock();
printf("full dgesvdx %lf seconds\n",(double)(toc-tic)/CLOCKS_PER_SEC);
int first_few_sv = 200;
double* u_partial_svd = (double*)calloc(num_rows*first_few_sv,sizeof(double));
double* s_partial_svd = (double*)calloc(num_sing_vals,sizeof(double));
double* vt_partial_svd = (double*)calloc(first_few_sv*num_cols,sizeof(double));
tic = clock();
success = svd(A,num_rows,num_cols,first_few_sv,u_partial_svd,s_partial_svd,vt_partial_svd,false);
toc= clock();
printf("partial dgesvdx %lf seconds\n",(double)(toc-tic)/CLOCKS_PER_SEC);
free(A);
free(u_sdd);
free(s_sdd);
free(vt_sdd);
free(u_svd);
free(s_svd);
free(vt_svd);
free(u_partial_svd);
free(s_partial_svd);
free(vt_partial_svd);
return 0;
}
compiling with:
gcc test_svd.c -O3 -lm -llapack -Ddgesdd=dgesdd_ -Ddgesvdx=dgesvdx_
This gives me the following timing outputs:
full dgesdd 8.003578 seconds
full dgesvdx 17.077815 seconds
partial dgesvdx 1.498775 seconds
Comparing with the following python code:
test_svd.py
import numpy, time
dim = 1000
term1 = numpy.sin((5*numpy.pi*numpy.arange(dim**2).reshape(dim,dim))/(dim**2))
term2 = numpy.cos((10*numpy.pi*numpy.arange(dim**2).reshape(dim,dim).T)/(dim**2))
A = term1+term2
A = numpy.where(numpy.abs(A)<0.75,0,A)
A = numpy.where(numpy.abs(A)>0.9,0,A)
tic = time.time()
U,S,Vt = numpy.linalg.svd(A)
toc = time.time()
print "full svd", toc - tic, "seconds"
gives:
python2 test_svd.py
full svd 2.4460 seconds
The actual singluar values agree. However, the fact that a full SVD in Python is comparable in time to a partial SVD in C is puzzling to me.
In case it matters my python is Python 2.7.12 compiled with GCC 6.2.1 and my gcc is 6.2.1