I run a model on Triton Inference Server with shared memory and it works correctly. To simulate my backend structure, I wrote a FastAPI app for my model and run it with gunicorn with 6 workers. I then wrote another FastAPI app to route Locust requests to the first FastAPI app (pseudo code sketch below); this second app runs with uvicorn. The problem is that when I use multiple workers for uvicorn, Triton's shared memory fails.

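The routing app looks roughly like this (a simplified sketch of the pseudo code; the URL, endpoint path, and use of httpx are just placeholders):

    import httpx
    from fastapi import FastAPI, Request

    app = FastAPI()

    # placeholder address of the first (model-serving) FastAPI app
    MODEL_API_URL = "http://localhost:8080/predict"

    @app.post("/predict")
    async def route(request: Request):
        payload = await request.json()
        # forward the Locust request to the model-serving FastAPI app
        async with httpx.AsyncClient() as client:
            response = await client.post(MODEL_API_URL, json=payload)
        return response.json()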

Note: without shared memory everything works, but the response time is much longer than with the shared memory option, so I need to use shared memory.

Here is my Triton client code. The client has a function named predict which uses requestGenerator to share the input_simple and output_simple regions.

This is my requestGenerator:

    def requestGenerator(self, triton_client, batched_img_data, input_name,
                         output_name, dtype, batch_data):
        # requires: numpy as np, tritonclient.http as httpclient,
        # tritonclient.utils.shared_memory as shm

        # unregister any regions left over from a previous request
        triton_client.unregister_system_shared_memory()
        triton_client.unregister_cuda_shared_memory()

        output_simple = "output_simple"
        input_simple = "input_simple"

        input_data = np.ones(
            shape=(batch_data, 3, self.width, self.height), dtype=np.float32)

        input_byte_size = input_data.size * input_data.itemsize
        output_byte_size = input_byte_size * 2

        # create and register the system shared-memory region for the output
        shm_op0_handle = shm.create_shared_memory_region(
            output_name, output_simple, output_byte_size)
        triton_client.register_system_shared_memory(
            output_name, output_simple, output_byte_size)

        # create and register the system shared-memory region for the input
        shm_ip0_handle = shm.create_shared_memory_region(
            input_name, input_simple, input_byte_size)
        triton_client.register_system_shared_memory(
            input_name, input_simple, input_byte_size)

        inputs = []
        inputs.append(
            httpclient.InferInput(input_name, batched_img_data.shape, dtype))
        inputs[0].set_data_from_numpy(batched_img_data, binary_data=True)

        outputs = []
        outputs.append(
            httpclient.InferRequestedOutput(output_name, binary_data=True))

        # point the request at the shared-memory regions instead of sending
        # the tensors in the HTTP body
        inputs[-1].set_shared_memory(input_name, input_byte_size)
        outputs[-1].set_shared_memory(output_name, output_byte_size)

        yield inputs, outputs, shm_ip0_handle, shm_op0_handle

This is my predict function:

    def predict(self, triton_client, batched_data, input_layer, output_layer, dtype):
        # requires: triton_to_np_dtype from tritonclient.utils
        responses = []
        results = None

        for inputs, outputs, shm_ip_handle, shm_op_handle in self.requestGenerator(
                triton_client, batched_data, input_layer, output_layer, dtype,
                len(batched_data)):
            self.sent_count += 1
            # copy the batched data into the input shared-memory region
            shm.set_shared_memory_region(shm_ip_handle, [batched_data])
            responses.append(
                triton_client.infer(model_name=self.model_name,
                                    inputs=inputs,
                                    request_id=str(self.sent_count),
                                    model_version="",
                                    outputs=outputs))

            # read the result back out of the output shared-memory region
            output_buffer = responses[0].get_output(output_layer)
            if output_buffer is not None:
                results = shm.get_contents_as_numpy(
                    shm_op_handle,
                    triton_to_np_dtype(output_buffer['datatype']),
                    output_buffer['shape'])

            # unregister and destroy both regions after every request
            triton_client.unregister_system_shared_memory()
            triton_client.unregister_cuda_shared_memory()
            shm.destroy_shared_memory_region(shm_ip_handle)
            shm.destroy_shared_memory_region(shm_op_handle)

        return results
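For context, the model-serving FastAPI app calls predict roughly like this (simplified sketch; TritonPredictor is just a placeholder name for the class that holds requestGenerator and predict, and the model name, layer names, and input shape are placeholders too):

    import numpy as np
    import tritonclient.http as httpclient
    from fastapi import FastAPI

    app = FastAPI()

    # placeholders: the class holding the requestGenerator/predict methods above,
    # plus made-up model/layer names and input shape
    predictor = TritonPredictor(model_name="simple_model", width=224, height=224)
    triton_client = httpclient.InferenceServerClient(url="localhost:8000")

    @app.post("/predict")
    def predict_endpoint():
        batched_data = np.ones((1, 3, 224, 224), dtype=np.float32)
        results = predictor.predict(triton_client, batched_data,
                                    input_layer="input", output_layer="output",
                                    dtype="FP32")
        return {"output_shape": list(results.shape)}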

Any help on how to use multiple uvicorn workers to send concurrent requests to my Triton client code without the shared memory failures would be appreciated.
