Intercepting cuda memory management related APIs in cuda11.7 and encountering invalid device context (201) errors

Question

Intercepting cuda memory management related APIs in cuda11.7 and encountering invalid device context (201) errors

100 views Asked by yyyfish At 18 November 2023 at 04:44

I implemented an interception library that intercepts the driver API cuMemAlloc() and cuGetProcAddress() and then forwards it. Then I set LD_LIBRARY_PATH to the path of the interception library and run a matrix multiplication program written with the runtime API. When the program calls cudaMalloc (), the driver layer API cuGetProcAddress() will be called internally to obtain the function address of cuMemAlloc() to call cuMemAlloc(), so I intercepted cuGetProcAddress and changed the cuMemAlloc function address it obtained to the cuMemAlloc function address of my interception library, the function can be intercepted at this time, but a 201 error is returned after calling the original library function through dlsym. How can I solve this problem? Is my interception process correct? My code is as follows

hook library

#include <stdio.h>
#include <dlfcn.h>
#include <string.h>
#include <stdint.h>

typedef enum cudaError_enum {
    CUDA_SUCCESS                              = 0,
    //...
    //...Not copied completely from my code
    //...
    CUDA_ERROR_UNKNOWN                        = 999
} CUresult;

typedef unsigned long long CUdeviceptr_v2;
typedef CUdeviceptr_v2 CUdeviceptr;
typedef uint64_t cuuint64_t;
char *cuda_filename = "libcuda.so.515.65.01";


CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize){
        printf("hijacking cuMemAlloc!\n");
        CUresult (*hello)(CUdeviceptr *, size_t);
        CUresult ret;
        void *table = NULL;

        table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error can't find library %s", cuda_filename);
        }
        hello = (CUresult (*)(CUdeviceptr *, size_t))dlsym(table, "cuMemAlloc");
        if (!hello){
                printf("can't find function cuMemAlloc");
        }
        ret = hello(dptr, bytesize);


        return ret;
}


CUresult cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags){
        //printf("hijacking cuGetProcAddress!\n");
        CUresult (*hello)(const char *, void **, int, cuuint64_t);
        CUresult ret;
        void *table = NULL;

        table = dlopen(cuda_filename, RTLD_NOW | RTLD_NODELETE);
        if (!table) {
                printf("Error can't find library %s", cuda_filename);
        }
        hello = (CUresult (*)(const char *, void **, int, cuuint64_t))dlsym(table, "cuGetProcAddress");
        if (!hello){
                printf("can't find function cuGetProcAddress");
        }
        ret = hello(symbol, pfn, cudaVersion, flags);
        if (!strcmp(symbol, "cuGetProcAddress"))
                *pfn = cuGetProcAddress;
        if (!strcmp(symbol, "cuMemAlloc"))
                *pfn = cuMemAlloc;

        return ret;
}

compile and export

gcc hook.c -fPIC -shared -ldl -o libcuda.so.1
export LD_LIBRARY_PATH=$PWD

runtime application

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <sys/time.h>
#include <stdio.h>
#include <math.h>

const int Row=2048;
const int Col=2048;

__global__
void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    int j = threadIdx.y + blockDim.y * blockIdx.y;

    int sum = 0;
    for(int k=0;k<width;k++)
    {
        int a = M[j*width+k];
        int b = N[k*width+i];
        sum += a*b;
    }
    P[j*width+i] = sum;
}

int main()
{
            cudaError_t cuda_err = cudaSuccess;
            printf("func start \n");
            int *A = (int *)malloc(sizeof(int) * Row * Col);
            int *B = (int *)malloc(sizeof(int) * Row * Col);
            int *C = (int *)malloc(sizeof(int) * Row * Col);
            //malloc device memory
            int *d_dataA, *d_dataB, *d_dataC;
            printf("before cudaMalloc()\n");
            cuda_err = cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
            //cuda_err = cudaGetLastError();
            if (cudaSuccess != cuda_err)
            {
                    fprintf(stderr, "(%s:%s:%d)",  __FILE__, __FUNCTION__, __LINE__);
                    fprintf(stderr, "%s\n", cudaGetErrorString(cuda_err));
                    printf("cuda_err is %d\n", cuda_err);
                    exit(1);
            }
            printf("after cudaMalloc()\n");
            cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
            cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
            //set value
            for (int i = 0; i < Row*Col; i++) {
                A[i] = 90;
                B[i] = 10;
            }

            cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
            cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
            dim3 threadPerBlock(16, 16);
            dim3 blockNumber((Col+threadPerBlock.x-1)/ threadPerBlock.x, (Row+threadPerBlock.y-1)/ threadPerBlock.y );
            matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
            cudaDeviceSynchronize();
            cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
            free(A);
            free(B);
            free(C);
            cudaFree(d_dataA);
            cudaFree(d_dataB);
            cudaFree(d_dataC);

    return 0;
}

error output

nvcc matri.cu -o matri.out
./matri.out

func start
before cudaMalloc()
hijacking cuMemAlloc!
(matri.cu:main:40)invalid device context
cuda_err is 201

Original Q&A

TechQA.

Intercepting cuda memory management related APIs in cuda11.7 and encountering invalid device context (201) errors

There are 0 answers

Related Questions in CUDA

Related Questions in RUNTIME-ERROR

Related Questions in HOOK

Related Questions in CUDA-CONTEXT

Popular Questions

Popular Tags

Trending Questions