The problem is to compute the determinant of a complex matrix (5x5, 4x4, and 3x3) from a device function within a particular thread. Efficiency is crucial, as the original problem involves computing thousands of such matrices in a for loop within each thread.
Here is a working example using direct expansion (run with arg: 0) and cofactor expansion (run with arg: 1) methods:
#include <stdio.h>
#include <complex.h>
#include <cstring>
#include <cuda.h>
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define N 5
__host__ __device__ static __inline__ cuDoubleComplex cuCfms( cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex d){
double real_res;
double imag_res;
real_res = (cuCreal(x) * cuCreal(y)) - cuCreal(d);
imag_res = (cuCreal(x) * cuCimag(y)) - cuCimag(d);
real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
return make_cuDoubleComplex(real_res, imag_res);
}
__host__ __device__ static __inline__ cuDoubleComplex cuCaaaa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) + cuCreal(y) + cuCreal(u) + cuCreal(v),
cuCimag(x) + cuCimag(y) + cuCimag(u) + cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasas(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v),
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v));
}
__host__ __device__ static __inline__ cuDoubleComplex cuCasasa(cuDoubleComplex x,
cuDoubleComplex y,
cuDoubleComplex u,
cuDoubleComplex v,
cuDoubleComplex w) {
return make_cuDoubleComplex (cuCreal(x) - cuCreal(y) + cuCreal(u) - cuCreal(v) + cuCreal(w),
cuCimag(x) - cuCimag(y) + cuCimag(u) - cuCimag(v) + cuCimag(w) );
}
__device__ cuDoubleComplex cuC_5pos (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(a, cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex cuC_5neg (cuDoubleComplex a, cuDoubleComplex b,
cuDoubleComplex c, cuDoubleComplex d,
cuDoubleComplex e) {
// B[12]*(B[18]*B[24] - B[19]*B[23])
//a*b*c -d*e
return cuCmul(cuCmul(make_cuDoubleComplex(-1.0,0.0),a), cuCfms(b,c, cuCmul(d,e)));
}
__device__ cuDoubleComplex direct_det5 (cuDoubleComplex *B){
cuDoubleComplex det;
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det5 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det11 = cuCmul(B[0],cuCmul(B[6],cuCadd( cuC_5pos(B[12],B[18],B[24],B[19],B[23]),
cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]),
cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det12 = cuCmul(B[0],cuCmul(B[11],cuCadd(cuC_5pos(B[7],B[18],B[24],B[19],B[23]),
cuCadd( cuC_5neg(B[8],B[17],B[24],B[22],B[19]),
cuC_5pos(B[9],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det13 = cuCmul(B[0],cuCmul(B[16],cuCadd(cuC_5pos(B[7],B[13],B[24],B[23],B[14]),
cuCadd( cuC_5neg(B[8],B[12],B[24],B[22],B[14]),
cuC_5pos(B[9],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det14 = cuCmul(B[0],cuCmul(B[21],cuCadd(cuC_5pos(B[7],B[13],B[19],B[18],B[14]),
cuCadd( cuC_5neg(B[8],B[12],B[19],B[17],B[14]),
cuC_5pos(B[9],B[12],B[18],B[13],B[17]) ) ) ) );
det1 = cuCasas(det11, det12, det13, det14);
cuDoubleComplex det21 = cuCmul(B[1*5+0],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[12],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[13],B[17],B[24],B[22],B[19]), cuC_5pos(B[14],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det22 = cuCmul(B[1*5+0],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[2],B[18],B[24],B[19],B[23]), cuCadd( cuC_5neg(B[3],B[17],B[24],B[22],B[19]), cuC_5pos(B[4],B[17],B[23],B[22],B[18]) ) ) ) );
cuDoubleComplex det23 = cuCmul(B[1*5+0],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[2],B[13],B[24],B[23],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[24],B[22],B[14]), cuC_5pos(B[4],B[12],B[23],B[22],B[13]) ) ) ) );
cuDoubleComplex det24 = cuCmul(B[1*5+0],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[2],B[13],B[19],B[18],B[14]), cuCadd( cuC_5neg(B[3],B[12],B[19],B[17],B[14]), cuC_5pos(B[4],B[12],B[18],B[13],B[17]) ) ) ) );
det2 = cuCasas(det21, det22, det23, det24);
cuDoubleComplex det31 = cuCmul(B[10],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
cuC_5pos(B[1*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det32 = cuCmul(B[10],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[3*5+3],B[4*5+4],B[3*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[3*5+2],B[4*5+4],B[4*5+2],B[3*5+4]),
cuC_5pos(B[0*5+4],B[3*5+2],B[4*5+3],B[4*5+2],B[3*5+3]) ) ) ) );
cuDoubleComplex det33 = cuCmul(B[10],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det34 = cuCmul(B[10],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[1*5+3],B[3*5+2]) ) ) ) );
det3 = cuCasas(det31, det32, det33, det34);
cuDoubleComplex det41 = cuCmul(B[15],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
cuC_5pos(B[1*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det42 = cuCmul(B[15],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[4*5+4],B[2*5+4],B[4*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[4*5+4],B[4*5+2],B[2*5+4]),
cuC_5pos(B[0*5+4],B[2*5+2],B[4*5+3],B[4*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det43 = cuCmul(B[15],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[4*5+4],B[4*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[4*5+4],B[4*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[4*5+3],B[4*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det44 = cuCmul(B[15],cuCmul(B[4*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det4 = cuCasas(det41, det42, det43, det44);
cuDoubleComplex det51 = cuCmul(B[20],cuCmul(B[0*5+1],cuCadd(cuC_5pos(B[1*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
cuCadd( cuC_5neg(B[1*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
cuC_5pos(B[1*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det52 = cuCmul(B[20],cuCmul(B[1*5+1],cuCadd(cuC_5pos(B[0*5+2],B[2*5+3],B[3*5+4],B[2*5+4],B[3*5+3]),
cuCadd( cuC_5neg(B[0*5+3],B[2*5+2],B[3*5+4],B[3*5+2],B[2*5+4]),
cuC_5pos(B[0*5+4],B[2*5+2],B[3*5+3],B[3*5+2],B[2*5+3]) ) ) ) );
cuDoubleComplex det53 = cuCmul(B[20],cuCmul(B[2*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[3*5+4],B[3*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[3*5+4],B[3*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[3*5+3],B[3*5+2],B[1*5+3]) ) ) ) );
cuDoubleComplex det54 = cuCmul(B[20],cuCmul(B[3*5+1],cuCadd(cuC_5pos(B[0*5+2],B[1*5+3],B[2*5+4],B[2*5+3],B[1*5+4]),
cuCadd( cuC_5neg(B[0*5+3],B[1*5+2],B[2*5+4],B[2*5+2],B[1*5+4]),
cuC_5pos(B[0*5+4],B[1*5+2],B[2*5+3],B[1*5+3],B[2*5+2]) ) ) ) );
det5 = cuCasas(det51, det52, det53, det54);
det = cuCasasa(det1,det2,det3,det4,det5);
return det;
}
#define N 5
__device__ cuDoubleComplex det_4x4(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det1 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det2 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det3 = make_cuDoubleComplex(0.0,0.0);
cuDoubleComplex det4 = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
det1 = cuCmul(matrix[0],
cuCadd( cuCsub( cuCmul( matrix[5],
cuCsub( cuCmul(matrix[10],matrix[15]),
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ) ) ) );
det2 = cuCmul(matrix[1],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[10],matrix[15]),
cuCmul(matrix[11],matrix[14]) ))
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) ) );
det3 = cuCmul(matrix[2],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[9],matrix[15]),
cuCmul(matrix[11],matrix[13]) ))
,cuCmul(matrix[5],
cuCsub( cuCmul(matrix[8],matrix[15]),
cuCmul(matrix[11],matrix[12]) ) ) )
,cuCmul(matrix[7],
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det4 = cuCmul(matrix[3],
cuCadd( cuCsub( cuCmul( matrix[4],
cuCsub( cuCmul(matrix[9],matrix[14]),
cuCmul(matrix[10],matrix[13]) ))
,cuCmul(matrix[5],
cuCsub( cuCmul(matrix[8],matrix[14]),
cuCmul(matrix[10],matrix[12]) ) ) )
,cuCmul(matrix[6],
cuCsub( cuCmul(matrix[8],matrix[13]),
cuCmul(matrix[9],matrix[12]) ) ) ) );
det = cuCsub( cuCadd(det1,det3), cuCadd(det2,det4) );
return det;
}
__device__ cuDoubleComplex det_5x5(cuDoubleComplex *matrix) {
cuDoubleComplex det = make_cuDoubleComplex(0.0,0.0);
// Calculate the determinant using cofactor expansions
for (int col = 0; col < N; col++) {
cuDoubleComplex submatrix[N-1][N-1];
int subrow = 0, subcol = 0;
// Create the submatrix by excluding the current row and column
for (int row = 1; row < N; row++) {
for (int c = 0; c < N; c++) {
if (c == col) {
continue;
}
submatrix[subrow][subcol] = matrix[row * N + c];
subcol++;
}
subrow++;
subcol = 0;
}
// Calculate the cofactor
cuDoubleComplex det_sign = make_cuDoubleComplex((col % 2 == 0 ? 1 : -1),0.0);
cuDoubleComplex cofactor = cuCmul( cuCmul(det_sign,matrix[col]), det_4x4(&submatrix[0][0]));
// Accumulate the cofactors to calculate the determinant
det = cuCadd(det, cofactor);
}
return det;
}
__device__ cuDoubleComplex gpu_device_kernel(int i, int method) {
// some random data, depending on `i`, but for simplification say constant data
cuDoubleComplex matrix[5*5] = {
make_cuDoubleComplex(2.0*(i/69120 * 1000), 2.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(7.0, 0.0),
make_cuDoubleComplex(4.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(8.0, 0.0),
make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(3.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(2.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0),
make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(1.0, 0.0), make_cuDoubleComplex(2.0, 0.0),
make_cuDoubleComplex(5.0, 0.0), make_cuDoubleComplex(6.0*(i/69120 * 1000), 0.0), make_cuDoubleComplex(4.0, 0.0), make_cuDoubleComplex(3.0, 0.0), make_cuDoubleComplex(2.0, 2.0)
};
cuDoubleComplex cf_sign[5]={make_cuDoubleComplex(1.0,0.0),
make_cuDoubleComplex(-1.0,0.0),
make_cuDoubleComplex(1.0,0.0),
make_cuDoubleComplex(-1.0,0.0),
make_cuDoubleComplex(1.0,0.0),
};
cuDoubleComplex det;
if (method == 0) {
det = direct_det5(matrix);
} else {
det = det_5x5(matrix);
}
return det;
}
__global__ void gpu_global_kernel(cuDoubleComplex *d_res, int method) {
// some other computations, irrelevant to current problem
uint64_t total = 69120 * 1000;// 10 * 1000 * 1000; // this `total` is what I am referring to thousands
cuDoubleComplex det_sum, det;
det_sum = make_cuDoubleComplex(0.0,0.0);
for (int i = 0; i < total; i++) {
det = gpu_device_kernel(i, method);
det_sum = cuCadd(det_sum, det);
}
det_sum = cuCmul(det_sum, make_cuDoubleComplex(1.0/total,0.0) );
d_res[0] = det_sum;
return;
}
int main(int argc, char *argv[]) {
if (argc != 2) {
printf("Usage: %s <num1> <num2>\n", argv[0]);
return 1;
}
int method;
method = atoi(argv[1]);
double complex *res;
res = (double complex*)malloc(sizeof(double complex));
memset(res, 0.0,sizeof(double complex));
cuDoubleComplex *d_res;
cudaMalloc(&d_res, sizeof(cuDoubleComplex) );
int *d_method;
cudaMalloc(&d_method, sizeof(int) );
cudaMemcpy(d_method, &method, sizeof(int), cudaMemcpyHostToDevice);
clock_t direct_start, direct_end;
direct_start = clock();
gpu_global_kernel <<<1,1>>>(d_res, *d_method);
cudaDeviceSynchronize();
direct_end = clock();
cudaMemcpy(res, d_res, sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost );
printf("RESULT: %.16E + %.16E i\n", creal(res[0]), cimag(res[0]));
double direct_time = (double)((double)(direct_end - direct_start) / CLOCKS_PER_SEC);
if (method == 0) {
printf("DIRECT EXEC: %.5f s \n", direct_time);
} else {
printf("COFACTOR EXEC: %.5f s \n", direct_time);
}
cudaDeviceReset();
return 0;
}
However, I am wondering if these implementation is even efficient.
I am looking for an alternative method to compute the determinant of a general matrix inside a device function. The example I have provided is one of the most straightforward methods, but an alternative approach, such as LU/Cholesky decomposition, that can be called from the device function is what I am seeking. Unfortunately, packages like LAPACK are not available for CUDA, but maybe a similar compatible function from some other packages could work within the CUDA device environment.
Any suggestions or function examples would be greatly appreciated!
Since you are including a C++ header
(<cstring>)I am assuming its a C++ code, even though it looks very C-like.I took the general generated code that you provided for expansion and implemented it directly as is by simply defining the operators
*,+,-for thecuDoubleComplex.I call this method "generated" in the code below. Additionally, I made some slight modifications to your code. I added a program parameter to specify the number of GPU threads to have a proper GPU utilization. I also use separate kernels for each method. Method 0 is the original method, method 1 is your direct method, method 2 is my "generated" method.
I compiled the code with
nvcc -std=c++14 -O3 -arch=sm_70 main.cu -o mainand see the following timings on V100 Pcie for 32000 GPU threads.The code: