I have a strong suspicion that precision_mode='FP16' does nothing (TF 1.15). The size of the .pb file does not change. However, having read in this question that the weights might still be stored as float32 while float16 is used for computation, I tried to check the dtypes of the tensors instead.
Here we create the Keras model:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
import numpy as np
from tensorflow.python.platform import gfile
from tensorflow.python.framework import graph_io
# Build a small model: two 3x3 'same'-padded convolutions applied to a
# 3-channel input whose first two (non-batch) dimensions are left unspecified.
inp = keras.layers.Input(shape=(None, None, 3))
x = keras.layers.Conv2D(filters=64, kernel_size=3, padding='same')(inp)
out = keras.layers.Conv2D(filters=3, kernel_size=3, padding='same')(x)

model = keras.Model(inputs=[inp], outputs=[out])
model.compile(loss='mse', optimizer='adam')

# Record the names of the model's first input and output tensors and show them.
input_name, output_name = model.inputs[0].name, model.outputs[0].name
for tensor_name in (input_name, output_name):
    print(tensor_name)
# Printed output:
#   input_1:0
#   conv2d_1/BiasAdd:0
# -------------------- SAVING
# Freeze the Keras session's graph (variables become constants) and write the
# result to 'model.pb' as a binary GraphDef.
sess = K.get_session()
# Keep only the part of the tensor name before the first ':'.
output_name = output_name.partition(":")[0]
with sess.graph.as_default() as graph:
    input_graph_def = graph.as_graph_def()
    output_graph_def = tf.graph_util.convert_variables_to_constants(
        sess=sess,
        input_graph_def=input_graph_def,
        output_node_names=[output_name],
    )
    # Write the frozen graph to disk in binary form.
    graph_io.write_graph(output_graph_def, '', 'model.pb', as_text=False)
Then convert it to TensorRT using precision_mode='FP16':
import tensorflow as tf
#from tensorflow.contrib import tensorrt as trt
from tensorflow.python.compiler.tensorrt import trt_convert as trt
# Register a boolean flag named 'use_float16' with default True.
# NOTE(review): nothing in the visible code reads this flag; the precision is
# chosen by the precision_mode argument passed to the converter instead.
tf.flags.DEFINE_bool('use_float16', True, 'Whether we want to quantize it to float16.')
# Output node names handed to the TF-TRT converter via its `outputs` argument.
output_names = ['conv2d_1/BiasAdd']
def load_graph(file):
    """Read a binary GraphDef from *file* and import it into a new graph.

    No ``name`` is passed to ``tf.import_graph_def``, so TensorFlow's default
    import name scope applies to the imported nodes.

    Returns:
        A ``(graph, graph_def)`` tuple: the new ``tf.Graph`` holding the
        imported nodes and the parsed ``tf.GraphDef``.
    """
    with tf.gfile.GFile(file, 'rb') as pb_file:
        serialized = pb_file.read()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(serialized)

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph, graph_def
# Convert the frozen graph with TF-TRT in FP16 mode and save the result.
graph, graph_def = load_graph('model.pb')
tensorrt_graph = trt.create_inference_graph(
    graph_def,
    outputs=output_names,
    max_batch_size=1,
    precision_mode='FP16',
    # The model input is declared with shape=(None, None, 3), i.e. its
    # non-batch dimensions are unknown. The default static mode
    # (is_dynamic_op=False) needs fully defined shapes to build engines at
    # conversion time; otherwise segments are skipped and the graph stays
    # plain TensorFlow. Dynamic mode defers engine building to runtime, when
    # the actual input shape is known.
    is_dynamic_op=True,
)

# Sanity check: converted segments show up as 'TRTEngineOp' nodes. A count of
# zero means the converter left the graph untouched, whatever precision_mode
# was requested.
# NOTE(review): the graph's input/output tensors are expected to keep their
# original dtype; reduced precision applies inside the engines - confirm
# against the TF-TRT documentation.
num_trt_engines = sum(1 for node in tensorrt_graph.node if node.op == 'TRTEngineOp')
print('TRTEngineOp nodes in converted graph: %d' % num_trt_engines)

with tf.gfile.GFile('trt_model.pb', 'wb') as f:
    f.write(tensorrt_graph.SerializeToString())
Conversion log:
2020-10-21 15:54:14.659757: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1304] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3693 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1)
2020-10-21 15:54:14.661494: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x562666640c80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-10-21 15:54:14.661507: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): GeForce GTX 1050, Compute Capability 6.1
2020-10-21 15:54:14.669536: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:786] Optimization results for grappler item: tf_graph
2020-10-21 15:54:14.669560: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788] constant_folding: Graph size after: 9 nodes (-4), 8 edges (-4), time = 1.469ms.
2020-10-21 15:54:14.669569: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788] layout: Graph size after: 13 nodes (4), 12 edges (4), time = 0.588ms.
2020-10-21 15:54:14.669575: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788] constant_folding: Graph size after: 13 nodes (0), 12 edges (0), time = 1.32ms.
2020-10-21 15:54:14.669582: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:788] constant_folding: Graph size after: 13 nodes (0), 12 edges (0), time = 0.784ms.
Then load the converted graph and print the dtypes of its input and output tensors:
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow.core.framework import types_pb2, graph_pb2, attr_value_pb2
from tensorflow.tools.graph_transforms import TransformGraph
from google.protobuf import text_format
#tf.flags.DEFINE_bool('use_float16', True, 'Whether we want to quantize it to float16.')
def load_graph(model_path):
    """Import a GraphDef file into a fresh graph and open a session on it.

    A path ending in "pb" is parsed as a binary protobuf; any other path is
    parsed as protobuf text format. Nodes are imported with an empty name
    prefix (``name=""``).

    Returns:
        A ``(sess, graph)`` tuple: a ``tf.Session`` bound to the new graph,
        and the graph itself.
    """
    graph_def = tf.GraphDef()
    is_binary = model_path.endswith("pb")
    if is_binary:
        with open(model_path, "rb") as binary_file:
            graph_def.ParseFromString(binary_file.read())
    else:
        with open(model_path, "r") as text_file:
            text_format.Parse(text_file.read(), graph_def)

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def, name="")
    return tf.Session(graph=graph), graph
# Load the converted graph and name the tensors used for inference.
sess, graph = load_graph('trt_model.pb')
input_name = 'input_1:0'
output_name = 'conv2d_1/BiasAdd:0'
print('---------------Done---------------')

# Test model: read the image as RGB, divide by 255 and cast to float16.
test_img_orig = Image.open('test.jpg').convert('RGB')
test_img_orig = (np.array(test_img_orig) / 255.).astype(np.float16)


def print_endpoint_tensors():
    """Print the graph's input tensor and then its output tensor."""
    for tensor_name in (input_name, output_name):
        print(sess.graph.get_tensor_by_name(tensor_name))


# Show the tensors before and after a run to see whether their dtype changes.
print_endpoint_tensors()
output_tensor = sess.graph.get_tensor_by_name(output_name)
batched_image = np.expand_dims(test_img_orig, axis=0)  # add leading batch axis
output = sess.run(output_tensor, feed_dict={input_name: batched_image})
print_endpoint_tensors()
The result is
Tensor("input_1:0", shape=(?, ?, ?, 3), dtype=float32)
Tensor("conv2d_1/BiasAdd:0", shape=(?, ?, ?, 3), dtype=float32)
This suggests that the model is still float32. How can I convert my model to float16 using TensorRT?