TensorRT is not using float16 (or how to check?)

I have a strong suspicion that precision_mode='FP16' does nothing (TF 1.15). The size of the .pb file does not change, but having read in this question that the weights may still be stored as float32 while float16 is used for computation, I tried to check the tensor dtypes instead.

Here we create a Keras model:

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K

from tensorflow.python.framework import graph_io


inp = keras.layers.Input(shape=(None,None,3))
x = keras.layers.Conv2D(64, 3, padding='same')(inp)
out = keras.layers.Conv2D(3, 3, padding='same')(x)
model = keras.Model([inp], [out])

model.compile(optimizer='adam', loss='mse')


input_name = model.inputs[0].name
output_name = model.outputs[0].name



print(input_name)
print(output_name)
'''
input_1:0
conv2d_1/BiasAdd:0
'''


# -------------------- SAVING

sess = K.get_session()

# Strip the ":0" tensor suffix to get the graph node name
output_name = output_name.split(":")[0]

with sess.graph.as_default() as graph:

    input_graph_def = graph.as_graph_def()

    # Freeze the graph: replace all variables with constants
    output_graph_def = tf.graph_util.convert_variables_to_constants(
        sess,
        input_graph_def,
        output_node_names=[output_name])

    # Write the frozen graph to disk
    graph_io.write_graph(output_graph_def, '', 'model.pb', as_text=False)
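As a sanity check on what the frozen file actually stores: the weights of a frozen graph live in Const nodes, so their dtype (and byte size) can be read straight from the GraphDef. A minimal sketch, assuming model.pb was written as above:

import tensorflow as tf

graph_def = tf.GraphDef()
with tf.gfile.GFile('model.pb', 'rb') as f:
    graph_def.ParseFromString(f.read())

# Weights in a frozen graph are stored as Const nodes
for node in graph_def.node:
    if node.op == 'Const':
        t = node.attr['value'].tensor
        print(node.name, tf.DType(t.dtype).name, len(t.tensor_content), 'bytes')

If a conversion ever stored the weights in half precision, both the dtype and the byte counts printed here should drop accordingly.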

Then we convert it with TensorRT, using precision_mode='FP16':

import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

output_names = ['conv2d_1/BiasAdd']


def load_graph(file):
    with tf.gfile.GFile(file, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
        return graph, graph_def


graph, graph_def = load_graph('model.pb')

# Ask TF-TRT for an FP16 engine
tensorrt_graph = trt.create_inference_graph(
    graph_def,
    outputs=output_names,
    max_batch_size=1,
    precision_mode='FP16')

with tf.gfile.GFile('trt_model.pb', 'wb') as f:
    f.write(tensorrt_graph.SerializeToString())
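For completeness: create_inference_graph is deprecated in TF 1.15 in favour of the class-based trt.TrtGraphConverter, which accepts the same precision argument. A minimal equivalent sketch (same graph_def and output_names as above):

converter = trt.TrtGraphConverter(
    input_graph_def=graph_def,      # the frozen GraphDef loaded above
    nodes_blacklist=output_names,   # output nodes must not be folded into TRT segments
    max_batch_size=1,
    precision_mode='FP16')
frozen_trt_graph = converter.convert()  # returns the converted frozen GraphDef

with tf.gfile.GFile('trt_model.pb', 'wb') as f:
    f.write(frozen_trt_graph.SerializeToString())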

Conversion log:

2020-10-21 15:54:14.659757: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3693 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1)
2020-10-21 15:54:14.661494: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x562666640c80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-10-21 15:54:14.661507: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): GeForce GTX 1050, Compute Capability 6.1
2020-10-21 15:54:14.669536: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:786] Optimization results for grappler item: tf_graph
2020-10-21 15:54:14.669560: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788]   constant_folding: Graph size after: 9 nodes (-4), 8 edges (-4), time = 1.469ms.
2020-10-21 15:54:14.669569: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788]   layout: Graph size after: 13 nodes (4), 12 edges (4), time = 0.588ms.
2020-10-21 15:54:14.669575: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788]   constant_folding: Graph size after: 13 nodes (0), 12 edges (0), time = 1.32ms.
2020-10-21 15:54:14.669582: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788]   constant_folding: Graph size after: 13 nodes (0), 12 edges (0), time = 0.784ms.

Note that the log only shows generic Grappler passes (constant_folding, layout) and never mentions a TensorRT optimizer or any created engines, which already looks suspicious. Then we load the converted model and print the dtypes of its tensors:

from PIL import Image
import numpy as np

import tensorflow as tf
from google.protobuf import text_format


def load_graph(model_path):
    graph = tf.Graph()
    with graph.as_default():
        graph_def = tf.GraphDef()
        if model_path.endswith("pb"):
            # Binary frozen graph
            with open(model_path, "rb") as f:
                graph_def.ParseFromString(f.read())
        else:
            # Text-format graph
            with open(model_path, "r") as pf:
                text_format.Parse(pf.read(), graph_def)
        tf.import_graph_def(graph_def, name="")
        sess = tf.Session(graph=graph)
        return sess, graph


sess, graph = load_graph('trt_model.pb')

input_name = 'input_1:0'
output_name = 'conv2d_1/BiasAdd:0'

print('---------------Done---------------')

# Test the model

test_img_orig = Image.open('test.jpg').convert('RGB')
test_img_orig = (np.array(test_img_orig) / 255.).astype(np.float16)

output_tensor = sess.graph.get_tensor_by_name(output_name)
# NB: the float16 feed is silently upcast to the placeholder's dtype
output = sess.run(output_tensor, {input_name: test_img_orig[np.newaxis, ...]})

print(sess.graph.get_tensor_by_name(input_name))
print(sess.graph.get_tensor_by_name(output_name))

The result is:

Tensor("input_1:0", shape=(?, ?, ?, 3), dtype=float32)
Tensor("conv2d_1/BiasAdd:0", shape=(?, ?, ?, 3), dtype=float32)

This suggests the graph still runs in float32 (though, if I understand correctly, TF-TRT may keep the graph's inputs and outputs in float32 even in FP16 mode, so this check alone may not be conclusive). How can I quantize my model to float16 using TensorRT, and how can I verify which precision is actually used?
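One check that seems more direct, assuming TF-TRT inserts TRTEngineOp nodes for the converted segments: scan the converted GraphDef for such nodes and print their precision_mode attribute. A rough sketch:

import tensorflow as tf

graph_def = tf.GraphDef()
with tf.gfile.GFile('trt_model.pb', 'rb') as f:
    graph_def.ParseFromString(f.read())

# TF-TRT replaces converted subgraphs with TRTEngineOp nodes
trt_ops = [n for n in graph_def.node if n.op == 'TRTEngineOp']
print('TRTEngineOp nodes found:', len(trt_ops))
for n in trt_ops:
    # precision_mode is stored as a string attribute on the op
    print(n.name, n.attr['precision_mode'].s.decode())

If this prints zero nodes, the conversion silently fell back to the plain TensorFlow graph and TensorRT is not being used at all.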
