Intercepting cuda memory management related APIs in cuda11.7 and encountering invalid device context (201) errors

106 views Asked by At

I implemented an interception library that intercepts the driver API cuMemAlloc() and cuGetProcAddress() and then forwards it. Then I set LD_LIBRARY_PATH to the path of the interception library and run a matrix multiplication program written with the runtime API. When the program calls cudaMalloc (), the driver layer API cuGetProcAddress() will be called internally to obtain the function address of cuMemAlloc() to call cuMemAlloc(), so I intercepted cuGetProcAddress and changed the cuMemAlloc function address it obtained to the cuMemAlloc function address of my interception library, the function can be intercepted at this time, but a 201 error is returned after calling the original library function through dlsym. How can I solve this problem? Is my interception process correct? My code is as follows

  • hook library
#include <stdio.h>
#include <dlfcn.h>
#include <string.h>
#include <stdint.h>

typedef enum cudaError_enum {
    CUDA_SUCCESS                              = 0,
    //...
    //...Not copied completely from my code
    //...
    CUDA_ERROR_UNKNOWN                        = 999
} CUresult;

typedef unsigned long long CUdeviceptr_v2;
typedef CUdeviceptr_v2 CUdeviceptr;
typedef uint64_t cuuint64_t;
char *cuda_filename = "libcuda.so.515.65.01";


CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize){
        printf("hijacking cuMemAlloc!\n");
        CUresult (*hello)(CUdeviceptr *, size_t);
        CUresult ret;
        void *table = NULL;

        table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error can't find library %s", cuda_filename);
        }
        hello = (CUresult (*)(CUdeviceptr *, size_t))dlsym(table, "cuMemAlloc");
        if (!hello){
                printf("can't find function cuMemAlloc");
        }
        ret = hello(dptr, bytesize);


        return ret;
}


CUresult cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags){
        //printf("hijacking cuGetProcAddress!\n");
        CUresult (*hello)(const char *, void **, int, cuuint64_t);
        CUresult ret;
        void *table = NULL;

        table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error can't find library %s", cuda_filename);
        }
        hello = (CUresult (*)(const char *, void **, int, cuuint64_t))dlsym(table, "cuGetProcAddress");
        if (!hello){
                printf("can't find function cuGetProcAddress");
        }
        ret = hello(symbol, pfn, cudaVersion, flags);
        if (!strcmp(symbol, "cuGetProcAddress"))
                *pfn = cuGetProcAddress;
        if (!strcmp(symbol, "cuMemAlloc"))
                *pfn = cuMemAlloc;

        return ret;
}

  • compile and export
gcc hook.c -fPIC -shared -ldl -o libcuda.so.1
export LD_LIBRARY_PATH=$PWD
  • runtime application
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <sys/time.h>
#include <stdio.h>
#include <math.h>

const int Row=2048;
const int Col=2048;

__global__
void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int j = threadIdx.y + blockDim.y * blockIdx.y;

    int sum = 0;
    for(int k=0;k<width;k++)
    {
        int a = M[j*width+k];
        int b = N[k*width+i];
        sum += a*b;
    }
    P[j*width+i] = sum;
}

int main()
{
            cudaError_t cuda_err = cudaSuccess;
            printf("func start \n");
            int *A = (int *)malloc(sizeof(int) * Row * Col);
            int *B = (int *)malloc(sizeof(int) * Row * Col);
            int *C = (int *)malloc(sizeof(int) * Row * Col);
            //malloc device memory
            int *d_dataA, *d_dataB, *d_dataC;
            printf("before cudaMalloc()\n");
            cuda_err = cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
            //cuda_err = cudaGetLastError();
            if (cudaSuccess != cuda_err)
            {
                    fprintf(stderr, "(%s:%s:%d)",  __FILE__, __FUNCTION__, __LINE__);
                    fprintf(stderr, "%s\n", cudaGetErrorString(cuda_err));
                    printf("cuda_err is %d\n", cuda_err);
                    exit(1);
            }
            printf("after cudaMalloc()\n");
            cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
            cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
            //set value
            for (int i = 0; i < Row*Col; i++) {
                A[i] = 90;
                B[i] = 10;
            }

            cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
            cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
            dim3 threadPerBlock(16, 16);
            dim3 blockNumber((Col+threadPerBlock.x-1)/ threadPerBlock.x, (Row+threadPerBlock.y-1)/ threadPerBlock.y );
            matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
            cudaDeviceSynchronize();
            cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
            free(A);
            free(B);
            free(C);
            cudaFree(d_dataA);
            cudaFree(d_dataB);
            cudaFree(d_dataC);

    return 0;
}

  • error output
nvcc matri.cu -o matri.out
./matri.out

func start
before cudaMalloc()
hijacking cuMemAlloc!
(matri.cu:main:40)invalid device context
cuda_err is 201
0

There are 0 answers