I implemented an interception library that intercepts the driver API cuMemAlloc()
and cuGetProcAddress()
and then forwards it. Then I set LD_LIBRARY_PATH
to the path of the interception library and run a matrix multiplication program written with the runtime API. When the program calls cudaMalloc ()
, the driver layer API cuGetProcAddress()
will be called internally to obtain the function address of cuMemAlloc()
to call cuMemAlloc()
, so I intercepted cuGetProcAddress and changed the cuMemAlloc function address it obtained to the cuMemAlloc
function address of my interception library, the function can be intercepted at this time, but a 201 error is returned after calling the original library function through dlsym
. How can I solve this problem? Is my interception process correct?
My code is as follows
- hook library
#include <stdio.h>
#include <dlfcn.h>
#include <string.h>
#include <stdint.h>
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
//...
//...Not copied completely from my code
//...
CUDA_ERROR_UNKNOWN = 999
} CUresult;
typedef unsigned long long CUdeviceptr_v2;
typedef CUdeviceptr_v2 CUdeviceptr;
typedef uint64_t cuuint64_t;
char *cuda_filename = "libcuda.so.515.65.01";
CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize){
printf("hijacking cuMemAlloc!\n");
CUresult (*hello)(CUdeviceptr *, size_t);
CUresult ret;
void *table = NULL;
table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
if (!table) {
printf("Error can't find library %s", cuda_filename);
}
hello = (CUresult (*)(CUdeviceptr *, size_t))dlsym(table, "cuMemAlloc");
if (!hello){
printf("can't find function cuMemAlloc");
}
ret = hello(dptr, bytesize);
return ret;
}
CUresult cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags){
//printf("hijacking cuGetProcAddress!\n");
CUresult (*hello)(const char *, void **, int, cuuint64_t);
CUresult ret;
void *table = NULL;
table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
if (!table) {
printf("Error can't find library %s", cuda_filename);
}
hello = (CUresult (*)(const char *, void **, int, cuuint64_t))dlsym(table, "cuGetProcAddress");
if (!hello){
printf("can't find function cuGetProcAddress");
}
ret = hello(symbol, pfn, cudaVersion, flags);
if (!strcmp(symbol, "cuGetProcAddress"))
*pfn = cuGetProcAddress;
if (!strcmp(symbol, "cuMemAlloc"))
*pfn = cuMemAlloc;
return ret;
}
- compile and export
gcc hook.c -fPIC -shared -ldl -o libcuda.so.1
export LD_LIBRARY_PATH=$PWD
- runtime application
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <sys/time.h>
#include <stdio.h>
#include <math.h>
const int Row=2048;
const int Col=2048;
__global__
void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int sum = 0;
for(int k=0;k<width;k++)
{
int a = M[j*width+k];
int b = N[k*width+i];
sum += a*b;
}
P[j*width+i] = sum;
}
int main()
{
cudaError_t cuda_err = cudaSuccess;
printf("func start \n");
int *A = (int *)malloc(sizeof(int) * Row * Col);
int *B = (int *)malloc(sizeof(int) * Row * Col);
int *C = (int *)malloc(sizeof(int) * Row * Col);
//malloc device memory
int *d_dataA, *d_dataB, *d_dataC;
printf("before cudaMalloc()\n");
cuda_err = cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
//cuda_err = cudaGetLastError();
if (cudaSuccess != cuda_err)
{
fprintf(stderr, "(%s:%s:%d)", __FILE__, __FUNCTION__, __LINE__);
fprintf(stderr, "%s\n", cudaGetErrorString(cuda_err));
printf("cuda_err is %d\n", cuda_err);
exit(1);
}
printf("after cudaMalloc()\n");
cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
//set value
for (int i = 0; i < Row*Col; i++) {
A[i] = 90;
B[i] = 10;
}
cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
dim3 threadPerBlock(16, 16);
dim3 blockNumber((Col+threadPerBlock.x-1)/ threadPerBlock.x, (Row+threadPerBlock.y-1)/ threadPerBlock.y );
matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
cudaDeviceSynchronize();
cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
free(A);
free(B);
free(C);
cudaFree(d_dataA);
cudaFree(d_dataB);
cudaFree(d_dataC);
return 0;
}
- error output
nvcc matri.cu -o matri.out
./matri.out
func start
before cudaMalloc()
hijacking cuMemAlloc!
(matri.cu:main:40)invalid device context
cuda_err is 201