When I use TensorRT to run inference on my model, it looks like CPU memory is leaking. My common execution code is the official NVIDIA TensorRT sample code with a few small modifications:
import ctypes
from typing import List

import numpy as np
from cuda import cudart

# check_cuda_err / cuda_call are the error-checking helpers from the official common.py
# (a sketch of them is shown after this listing).


class HostDeviceMem:
    """Pair of host and device memory, where the host memory is wrapped in a numpy array"""

    def __init__(self, size: int, dtype: np.dtype):
        nbytes = size * dtype.itemsize
        host_mem = cuda_call(cudart.cudaMallocHost(nbytes))
        pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))

        self._host = np.ctypeslib.as_array(ctypes.cast(host_mem, pointer_type), (size,))
        self._device = cuda_call(cudart.cudaMalloc(nbytes))
        self._nbytes = nbytes

    @property
    def host(self) -> np.ndarray:
        return self._host

    @host.setter
    def host(self, arr: np.ndarray):
        if arr.size > self.host.size:
            raise ValueError(
                f"Tried to fit an array of size {arr.size} into host memory of size {self.host.size}"
            )
        np.copyto(self.host[:arr.size], arr.flat, casting='safe')

    @property
    def nbytes(self) -> int:
        return self._nbytes

    # My modification: make nbytes settable so the copy size can follow the actual input shape.
    @nbytes.setter
    def nbytes(self, nbytes: int):
        self._nbytes = nbytes

    @property
    def device(self) -> int:
        return self._device

    def __str__(self):
        return f"Host:\n{self.host}\nDevice:\n{self.device}\nSize:\n{self.nbytes}\n"

    def __repr__(self):
        return self.__str__()

    def free(self):
        cuda_call(cudart.cudaFree(self.device))
        cuda_call(cudart.cudaFreeHost(self.host.ctypes.data))

# Frees the resources allocated in allocate_buffers (host/device buffers and the CUDA stream).
def free_buffers(inputs: List[HostDeviceMem], outputs: List[HostDeviceMem], stream: cudart.cudaStream_t):
    for mem in inputs + outputs:
        mem.free()
    cuda_call(cudart.cudaStreamDestroy(stream))

# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_host_to_device(device_ptr: int, host_arr: np.ndarray):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(cudart.cudaMemcpy(device_ptr, host_arr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice))


# Wrapper for cudaMemcpy which infers copy size and does error checking
def memcpy_device_to_host(host_arr: np.ndarray, device_ptr: int):
    nbytes = host_arr.size * host_arr.itemsize
    cuda_call(cudart.cudaMemcpy(host_arr, device_ptr, nbytes, cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost))

def _do_inference_base(inputs, outputs, stream, execute_async):
    # Transfer input data to the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice
    [cuda_call(cudart.cudaMemcpyAsync(inp.device, inp.host, inp.nbytes, kind, stream)) for inp in inputs]
    # Run inference.
    execute_async()
    # Transfer predictions back from the GPU.
    kind = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost
    [cuda_call(cudart.cudaMemcpyAsync(out.host, out.device, out.nbytes, kind, stream)) for out in outputs]
    # Synchronize the stream
    cuda_call(cudart.cudaStreamSynchronize(stream))
    # Return only the host outputs.
    return [out.host for out in outputs]

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    def execute_async():
        context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream)
    return _do_inference_base(inputs, outputs, stream, execute_async)


# This function is generalized for multiple inputs/outputs for full dimension networks.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference_v2(context, bindings, inputs, outputs, stream):
    def execute_async():
        context.execute_async_v2(bindings=bindings, stream_handle=stream)
    return _do_inference_base(inputs, outputs, stream, execute_async)
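For reference, `cuda_call` is used throughout the listing above but not shown. In the official common.py it is roughly the following error-checking wrapper around the cuda-python calling convention (error code first, then results); I am reproducing it from memory, so check your local copy for the exact version:

from cuda import cuda, cudart


def check_cuda_err(err):
    # cuda-python reports driver errors as cuda.CUresult and runtime errors as cudart.cudaError_t.
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
    elif isinstance(err, cudart.cudaError_t):
        if err != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError("Cuda Runtime Error: {}".format(err))
    else:
        raise RuntimeError("Unknown error type: {}".format(err))


def cuda_call(call):
    # Each cuda-python call returns (error, result...). Raise on error and
    # unwrap a single result so callers get the value directly.
    err, res = call[0], call[1:]
    check_cuda_err(err)
    if len(res) == 1:
        res = res[0]
    return res

To make the data flow concrete, here is a self-contained round-trip sketch using HostDeviceMem and the memcpy wrappers (the size and dtype are arbitrary, purely illustrative):

# Allocate a pinned-host/device pair, copy host -> device -> host, then free.
buf = HostDeviceMem(16, np.dtype(np.float32))
buf.host = np.arange(16, dtype=np.float32)   # copy into pinned host memory
memcpy_host_to_device(buf.device, buf.host)  # pinned host -> device
memcpy_device_to_host(buf.host, buf.device)  # device -> pinned host
print(buf.host)
buf.free()                                   # release both the pinned host and the device allocation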
My inference code is below. The input is an image in NCHW layout; the model has one input and two outputs.
import common
import os
import sys
from common import cuda_call, HostDeviceMem
import ctypes
import numpy as np
import tensorrt as trt
from cuda import cudart

class TensorRTInfer:
    """
    Implements inference for the TensorRT engine.
    """

    def __init__(self, engine_path):
        """
        :param engine_path: The path to the serialized engine to load from disk.
        """
        # Load TRT engine
        print("Loading engine from file {}".format(engine_path))
        self.logger = trt.Logger(trt.Logger.ERROR)
        trt.init_libnvinfer_plugins(self.logger, namespace="")
        with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
            assert runtime
            self.engine = runtime.deserialize_cuda_engine(f.read())
        assert self.engine
        self.context = self.engine.create_execution_context()
        assert self.context
        self.context.active_optimization_profile = 0
        self.inputs, self.outputs, self.bindings, self.stream = \
            self.allocate_buffers()

    def infer(self, batch):
        rt_input_shape = batch.shape
        rt_size = trt.volume(rt_input_shape)
        rt_nbytes = rt_size * batch.dtype.itemsize

        batch = np.ascontiguousarray(batch)
        self.inputs[0].host = batch
        self.inputs[0].nbytes = rt_nbytes

        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        output_tensor_names = [itm for itm in tensor_names if self.engine.get_tensor_mode(itm) == trt.TensorIOMode.OUTPUT]
        input_tensor_names = [itm for itm in tensor_names if self.engine.get_tensor_mode(itm) == trt.TensorIOMode.INPUT]
        self.context.set_input_shape(input_tensor_names[0], rt_input_shape)

        # Resolve the actual output shapes for this input shape and update the output copy sizes.
        rt_output_shapes = []
        for i, binding in enumerate(output_tensor_names):
            rt_output_shape = self.context.get_tensor_shape(binding)
            rt_output_shapes.append(rt_output_shape)
            dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(binding)))
            self.outputs[i].nbytes = trt.volume(rt_output_shape) * dtype.itemsize

        trt_outs = common.do_inference_v2(self.context, bindings=self.bindings,
                                          inputs=self.inputs, outputs=self.outputs, stream=self.stream)
        # trt_outs = [out.reshape(shape) for out, shape in zip(trt_outs, rt_output_shapes)]
        size_0 = trt.volume(rt_output_shapes[0])
        trt_out = trt_outs[0][:size_0].reshape(rt_output_shapes[0])
        return trt_out

    def allocate_buffers(self):
        """
        Allocates all buffers required for the engine, i.e. host/device inputs/outputs.
        The engine uses dynamic shapes, so buffers are sized for the maximum shape
        of optimization profile 0.
        """
        inputs = []
        outputs = []
        bindings = []
        stream = cuda_call(cudart.cudaStreamCreate())

        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
        output_tensor_names = [itm for itm in tensor_names if self.engine.get_tensor_mode(itm) == trt.TensorIOMode.OUTPUT]
        input_tensor_names = [itm for itm in tensor_names if self.engine.get_tensor_mode(itm) == trt.TensorIOMode.INPUT]

        # Size everything for the largest input shape in optimization profile 0.
        max_profile_shape = self.engine.get_tensor_profile_shape(input_tensor_names[0], 0)[-1]
        self.context.set_input_shape(input_tensor_names[0], max_profile_shape)

        for binding in input_tensor_names:
            size = trt.volume(max_profile_shape)
            dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(binding)))
            bindingMemory = HostDeviceMem(size, dtype)
            bindings.append(int(bindingMemory.device))
            inputs.append(bindingMemory)

        for binding in output_tensor_names:
            output_shape = self.context.get_tensor_shape(binding)
            size = trt.volume(output_shape)
            dtype = np.dtype(trt.nptype(self.engine.get_tensor_dtype(binding)))
            bindingMemory = HostDeviceMem(size, dtype)
            bindings.append(int(bindingMemory.device))
            outputs.append(bindingMemory)

        return inputs, outputs, bindings, stream
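For reference, nothing in the code above ever calls free_buffers, so the pinned host buffers, device buffers, and the CUDA stream created in allocate_buffers stay allocated for the lifetime of each TensorRTInfer instance. Below is a minimal cleanup sketch, assuming a method added to TensorRTInfer; the name close is mine, not part of the NVIDIA sample, and it only makes the buffer lifetime explicit:

    # Hypothetical method on TensorRTInfer (name and placement are my own choice).
    def close(self):
        # Release the pinned host buffers, device buffers and the CUDA stream
        # created in allocate_buffers(); without an explicit call they are only
        # reclaimed at process exit.
        common.free_buffers(self.inputs, self.outputs, self.stream)
        self.inputs, self.outputs, self.bindings = [], [], []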
This problem has been bugging me for days; I would really appreciate a little boost. I tried to debug the whole pipeline and narrowed the problem down to this block, but I cannot solve it.
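In case it helps to reproduce, here is a minimal loop for watching the growth I am describing; the engine path and NCHW shape are placeholders, and psutil is only used to read the process RSS:

# Hypothetical repro loop: engine path and input shape are placeholders.
import numpy as np
import psutil

trt_infer = TensorRTInfer("model.engine")
proc = psutil.Process()

img = np.random.rand(1, 3, 640, 640).astype(np.float32)  # dummy NCHW input
for i in range(1000):
    trt_infer.infer(img)
    if i % 100 == 0:
        # Resident set size in MiB; steady growth here is what I mean by the CPU memory leak.
        print(f"iter {i}: RSS = {proc.memory_info().rss / 2**20:.1f} MiB")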