I run a model on Triton Inference Server with shared memory and it works correctly. To simulate my backend structure, I wrote a FastAPI app for my model and run it with gunicorn with 6 workers. Then I wrote another FastAPI app that routes Locust requests to the first FastAPI app (pseudocode sketch below). The second FastAPI app runs with uvicorn. The problem is that as soon as I use multiple workers for uvicorn, Triton fails on the shared memory operations.
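Roughly, the router does this (module names, the /predict path, and the port here are illustrative placeholders, not my real values):

# router.py: the second FastAPI app (run with uvicorn), which forwards
# Locust traffic to the first FastAPI app (the gunicorn one)
import httpx
from fastapi import FastAPI, Request

app = FastAPI()
MODEL_API_URL = "http://localhost:8080/predict"  # first FastAPI app (gunicorn, 6 workers)

@app.post("/predict")
async def route(request: Request):
    payload = await request.body()
    async with httpx.AsyncClient() as client:
        # relay the raw request body and return the model API's response
        response = await client.post(MODEL_API_URL, content=payload)
    return response.json()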
Note: without shared memory everything works, but the response time is much longer than with shared memory, so I need the shared memory option.
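For reference, the two apps are launched roughly like this (module and app names are placeholders):

# first FastAPI app (wraps the Triton client), 6 gunicorn workers
gunicorn model_api:app -w 6 -k uvicorn.workers.UvicornWorker

# second FastAPI app (the router); the failure appears as soon as --workers > 1
uvicorn router:app --workers 4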
Here is my Triton client code. The client has a function named predict, which uses the requestGenerator below to set up the input_simple and output_simple shared memory regions.

This is my requestGenerator:
# module-level imports used by this client code
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm
from tritonclient.utils import triton_to_np_dtype

def requestGenerator(self, triton_client, batched_img_data, input_name,
                     output_name, dtype, batch_data):
    # with no arguments, these unregister ALL regions on the server,
    # not just the ones this client created
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()

    output_simple = "output_simple"
    input_simple = "input_simple"

    input_data = np.ones(
        shape=(batch_data, 3, self.width, self.height), dtype=np.float32)
    input_byte_size = input_data.size * input_data.itemsize
    output_byte_size = input_byte_size * 2

    # create the regions locally and register them with the server
    shm_op0_handle = shm.create_shared_memory_region(
        output_name, output_simple, output_byte_size)
    triton_client.register_system_shared_memory(
        output_name, output_simple, output_byte_size)
    shm_ip0_handle = shm.create_shared_memory_region(
        input_name, input_simple, input_byte_size)
    triton_client.register_system_shared_memory(
        input_name, input_simple, input_byte_size)

    inputs = []
    inputs.append(
        httpclient.InferInput(input_name, batched_img_data.shape, dtype))
    # redundant: overridden by set_shared_memory below
    inputs[0].set_data_from_numpy(batched_img_data, binary_data=True)
    outputs = []
    outputs.append(
        httpclient.InferRequestedOutput(output_name, binary_data=True))
    # attach the shared memory regions to the request
    inputs[-1].set_shared_memory(input_name, input_byte_size)
    outputs[-1].set_shared_memory(output_name, output_byte_size)
    yield inputs, outputs, shm_ip0_handle, shm_op0_handle
This is my predict function:
def predict(self, triton_client, batched_data, input_layer, output_layer, dtype):
    responses = []
    results = None
    for inputs, outputs, shm_ip_handle, shm_op_handle in self.requestGenerator(
            triton_client, batched_data, input_layer, output_layer, dtype,
            len(batched_data)):
        self.sent_count += 1
        # write the batch into the input shared memory region
        shm.set_shared_memory_region(shm_ip_handle, [batched_data])
        responses.append(
            triton_client.infer(model_name=self.model_name,
                                inputs=inputs,
                                request_id=str(self.sent_count),
                                model_version="",
                                outputs=outputs))
    # read the result back out of the output shared memory region
    output_buffer = responses[0].get_output(output_layer)
    if output_buffer is not None:
        results = shm.get_contents_as_numpy(
            shm_op_handle, triton_to_np_dtype(output_buffer['datatype']),
            output_buffer['shape'])
    # cleanup: again unregisters ALL regions on the server, then destroys
    # this process's local regions
    triton_client.unregister_system_shared_memory()
    triton_client.unregister_cuda_shared_memory()
    shm.destroy_shared_memory_region(shm_ip_handle)
    shm.destroy_shared_memory_region(shm_op_handle)
    return results
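My guess (unverified) is that the failure comes from every uvicorn worker creating and registering regions under the same fixed names, while unregister_system_shared_memory() with no arguments unregisters every region on the server, so workers tear down each other's registrations. Here is a sketch of what I mean by per-worker region names; all identifiers and sizes are illustrative:

import os
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

triton_client = httpclient.InferenceServerClient(url="localhost:8000")

# suffix names and keys with the PID so workers cannot collide
pid = os.getpid()
input_name = f"input_simple_{pid}"
input_key = f"/input_simple_{pid}"  # system shm keys conventionally start with "/"
input_byte_size = 4 * 3 * 224 * 224  # illustrative size

shm_ip_handle = shm.create_shared_memory_region(
    input_name, input_key, input_byte_size)
triton_client.register_system_shared_memory(
    input_name, input_key, input_byte_size)

# ... build inputs/outputs and call triton_client.infer ...

# unregister only this worker's region instead of all regions
triton_client.unregister_system_shared_memory(input_name)
shm.destroy_shared_memory_region(shm_ip_handle)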
Any help on how to use multiple uvicorn workers to send concurrent requests to my Triton client code without these shared memory failures would be appreciated.