Add inference_input_type and inference_output_type flags to the TF 2.x TFLiteConverter (backward compatible with TF 1.x) to support integer (tf.int8, tf.uint8) input and output types in post-training full-integer quantized models.

PiperOrigin-RevId: 313668965
Change-Id: Iea684507f58651b34dada0285b00a82e80066aab
A. Unique TensorFlower 2020-05-28 15:15:50 -07:00 committed by TensorFlower Gardener
parent 60c828a70e
commit 7d605fb0e2
2 changed files with 32 additions and 217 deletions
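
For context, a minimal usage sketch of the inference_input_type / inference_output_type flags described in the commit message, pieced together from the docstrings and tests in the diff below. The Keras model and calibration generator here are illustrative only, not part of this change:

```python
import tensorflow as tf

# Illustrative float model; any model with known input shapes would do.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

def representative_dataset():
  # Calibration data used to compute quantization parameters.
  for _ in range(100):
    yield [tf.random.uniform((1, 4), dtype=tf.float32)]

converter = tf.lite.TFLiteConverter.from_keras_model(model)
# Post-training full-integer quantization setup.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# The flags this change covers: integer I/O types are only accepted for
# post-training integer quantization (default is tf.float32).
converter.inference_input_type = tf.int8   # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8
tflite_model = converter.convert()
```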


@@ -201,11 +201,6 @@ class QuantizationMode(object):
             self._representative_dataset is not None and
             self._smallest_supported_type() == constants.INT8)

-  def is_post_training_integer_quantize(self):
-    """Post training integer quantization."""
-    return (self.post_training_int8_no_float() or
-            self.post_training_int8_allow_float())
-
   def training_time_int8_allow_float(self):
     """Training-time int8 quantize, allow float fallback."""
     return (self._any_optimization_enabled() and
@@ -418,56 +413,7 @@ class TFLiteConverterBase(object):


 class TFLiteConverterBaseV2(TFLiteConverterBase):
-  """Converter subclass to share functionality between V2 converters.
-
-  Attributes:
-    allow_custom_ops: Boolean indicating whether to allow custom operations.
-      When False, any unknown operation is an error. When True, custom ops are
-      created for any op that is unknown. The developer needs to provide these
-      to the TensorFlow Lite runtime with a custom resolver. (default False)
-    optimizations: Experimental flag, subject to change. A list of optimizations
-      to apply when converting the model. E.g. `[Optimize.DEFAULT]`
-    representative_dataset: A representative dataset that can be used to
-      generate input and output samples for the model. The converter can use the
-      dataset to evaluate different optimizations. Note that this is an optional
-      attribute but it is necessary if INT8 is the only support builtin ops in
-      target ops.
-    target_spec: Experimental flag, subject to change. Specification of target
-      device.
-    inference_input_type: Data type of the input layer. Note that integer types
-      (tf.int8 and tf.uint8) are currently only supported for post training
-      integer quantization. (default tf.float32, must be in {tf.float32,
-      tf.int8, tf.uint8})
-    inference_output_type: Data type of the output layer. Note that integer
-      types (tf.int8 and tf.uint8) are currently only supported for post
-      training integer quantization. (default tf.float32, must be in
-      {tf.float32, tf.int8, tf.uint8})
-    experimental_new_converter: Experimental flag, subject to change. Enables
-      MLIR-based conversion instead of TOCO conversion.
-  """
-
-  def __init__(self):
-    """Constructor for TFLiteConverter."""
-    super(TFLiteConverterBaseV2, self).__init__()
-    self.inference_input_type = constants.FLOAT
-    self.inference_output_type = constants.FLOAT
-
-  def _validate_inference_input_output_types(self, quant_mode):
-    """Validate inference_input_type and inference_output_type flags."""
-    default_types = [constants.FLOAT, None]
-    # We only support integer types for post training integer quantization
-    # as we have statistical information to quantize the input and output.
-    if quant_mode.is_post_training_integer_quantize():
-      all_types = default_types + [constants.INT8, constants.QUANTIZED_UINT8]
-      if self.inference_input_type not in all_types or \
-          self.inference_output_type not in all_types:
-        all_types_names = ["tf." + t.name for t in all_types]
-        raise ValueError("The inference_input_type and inference_output_type "
-                         "must be in {}.".format(all_types_names))
-    elif self.inference_input_type not in default_types or \
-        self.inference_output_type not in default_types:
-      raise ValueError("The inference_input_type and inference_output_type "
-                       "must be tf.float32.")
+  """Converter subclass to share functionality between V2 converters."""

   def convert(self, graph_def, input_tensors, output_tensors):
     """Converts a TensorFlow GraphDef based on instance variables.
@@ -491,8 +437,6 @@ class TFLiteConverterBaseV2(TFLiteConverterBase):
     quant_mode = QuantizationMode(self.optimizations, self.target_spec,
                                   self.representative_dataset, graph_def)

-    self._validate_inference_input_output_types(quant_mode)
-
     if not self._is_unknown_shapes_allowed():
       # Checks dimensions in input tensor.
       for tensor in input_tensors:
@@ -535,9 +479,6 @@ class TFLiteConverterBaseV2(TFLiteConverterBase):
           "quantize_to_float16": True,
       })

-    # Converter requires that the inference_input_type flag is set to FLOAT
-    converter_kwargs.update({"inference_input_type": constants.FLOAT})
-
     if not self.experimental_new_converter:
       logging.warning(
           "Please consider switching to use new converter by setting "
@@ -557,11 +498,11 @@ class TFLiteConverterBaseV2(TFLiteConverterBase):
         **converter_kwargs)

     if quant_mode.post_training_int8_no_float():
-      result = self._calibrate_quantize_model(result, self.inference_input_type,
-                                              self.inference_output_type, False)
+      result = self._calibrate_quantize_model(result, constants.FLOAT,
+                                              constants.FLOAT, False)
     elif quant_mode.post_training_int8_allow_float():
-      result = self._calibrate_quantize_model(result, self.inference_input_type,
-                                              self.inference_output_type, True)
+      result = self._calibrate_quantize_model(result, constants.FLOAT,
+                                              constants.FLOAT, True)

     if self._experimental_sparsify_model:
       result = _mlir_sparsify(result)
@@ -817,9 +758,12 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2):

   Attributes:
     allow_custom_ops: Boolean indicating whether to allow custom operations.
-      When False, any unknown operation is an error. When True, custom ops are
-      created for any op that is unknown. The developer needs to provide these
-      to the TensorFlow Lite runtime with a custom resolver. (default False)
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver.
+      (default False)
+    target_spec: Experimental flag, subject to change. Specification of target
+      device.
     optimizations: Experimental flag, subject to change. A list of optimizations
       to apply when converting the model. E.g. `[Optimize.DEFAULT]`
     representative_dataset: A representative dataset that can be used to
@@ -827,19 +771,8 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2):
       dataset to evaluate different optimizations. Note that this is an optional
       attribute but it is necessary if INT8 is the only support builtin ops in
       target ops.
-    target_spec: Experimental flag, subject to change. Specification of target
-      device.
-    inference_input_type: Data type of the input layer. Note that integer types
-      (tf.int8 and tf.uint8) are currently only supported for post training
-      integer quantization. (default tf.float32, must be in {tf.float32,
-      tf.int8, tf.uint8})
-    inference_output_type: Data type of the output layer. Note that integer
-      types (tf.int8 and tf.uint8) are currently only supported for post
-      training integer quantization. (default tf.float32, must be in
-      {tf.float32, tf.int8, tf.uint8})
-    experimental_new_converter: Experimental flag, subject to change. Enables
-      MLIR-based conversion instead of TOCO conversion.
+    experimental_new_converter: Experimental flag, subject to change.
+      Enables MLIR-based conversion instead of TOCO conversion.

   Example usage:

   ```python


@@ -71,27 +71,6 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest):
     actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
     self.assertEqual(expected_value.numpy(), actual_value)

-  @parameterized.named_parameters(
-      ('_INT8InputOutput', lite.constants.INT8),
-      ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8))
-  @test_util.run_v2_only
-  def testInvalidFloat(self, inference_input_output_type):
-    root = self._getSimpleVariableModel()
-    input_data = tf.constant(1., shape=[1])
-    concrete_func = root.f.get_concrete_function(input_data)
-
-    # Convert model.
-    converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func])
-    # We don't support integer types as we don't have statistical information
-    # to quantize (only supported for post training integer quantization).
-    with self.assertRaises(ValueError) as error:
-      converter.inference_input_type = inference_input_output_type
-      converter.inference_output_type = inference_input_output_type
-      converter.convert()
-    self.assertEqual(
-        'The inference_input_type and inference_output_type '
-        'must be tf.float32.', str(error.exception))
-
   @test_util.run_v2_only
   def testScalarInput(self):
     root = self._getSimpleVariableModel()
@@ -193,113 +172,39 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest):
     self.assertLess(len(quantized_tflite), len(float_tflite))

   @parameterized.named_parameters(
-      ('_INT8InputOutput', lite.constants.INT8),
-      ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8))
-  @test_util.run_v2_only
-  def testInvalidPostTrainingDynamicRangeQuantization(
-      self, inference_input_output_type):
-    func, _ = self._getCalibrationQuantizeModel()
-
-    # Convert float model.
-    converter = lite.TFLiteConverterV2.from_concrete_functions([func])
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
-
-    # Convert quantized model.
-    quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
-    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
-    # We don't support integer types as we don't have statistical information
-    # to quantize (only supported for post training integer quantization).
-    with self.assertRaises(ValueError) as error:
-      quantized_converter.inference_input_type = inference_input_output_type
-      quantized_converter.inference_output_type = inference_input_output_type
-      quantized_converter.convert()
-    self.assertEqual(
-        'The inference_input_type and inference_output_type '
-        'must be tf.float32.', str(error.exception))
-
-  @parameterized.named_parameters(
-      ('_DefaultFLOAT32InputOutput', lite.constants.FLOAT),
-      ('_INT8InputOutput', lite.constants.INT8),
-      ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8))
-  def testPostTrainingIntegerAllowFloatQuantization(
-      self, inference_input_output_type):
+      ('EnableMlirQuantizer', True),  # enable mlir quantizer
+      ('DisableMlirQuantizer', False))  # disable mlir quantizer
+  def testCalibrateAndQuantizeBuiltinInt8(self, mlir_quantizer):
     func, calibration_gen = self._getCalibrationQuantizeModel()

     # Convert float model.
-    converter = lite.TFLiteConverterV2.from_concrete_functions([func])
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
-
-    # Convert quantized model.
-    quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
-    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
-    quantized_converter.representative_dataset = calibration_gen
-    quantized_converter.inference_input_type = inference_input_output_type
-    quantized_converter.inference_output_type = inference_input_output_type
-    quantized_tflite_model = quantized_converter.convert()
-    self.assertTrue(quantized_tflite_model)
-
-    interpreter = Interpreter(model_content=quantized_tflite_model)
-    interpreter.allocate_tensors()
-    input_details = interpreter.get_input_details()
-    self.assertLen(input_details, 1)
-    self.assertEqual(inference_input_output_type.as_numpy_dtype,
-                     input_details[0]['dtype'])
-    output_details = interpreter.get_output_details()
-    self.assertLen(output_details, 1)
-    self.assertEqual(inference_input_output_type.as_numpy_dtype,
-                     output_details[0]['dtype'])
-
-    # Ensure that the quantized tflite model is smaller.
-    self.assertLess(len(quantized_tflite_model), len(tflite_model))
-
-  @parameterized.named_parameters(
-      ('_DefaultFLOAT32InputOutput_UseTargetTypesFlag',
-       lite.constants.FLOAT, False),
-      ('_DefaultFLOAT32InputOutput', lite.constants.FLOAT, True),
-      ('_INT8InputOutput', lite.constants.INT8, True),
-      ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8, True))
-  @test_util.run_v2_only
-  def testPostTrainingIntegerNoFloatQuantization(self,
-                                                 inference_input_output_type,
-                                                 use_target_ops_flag):
-    func, calibration_gen = self._getCalibrationQuantizeModel()
-
-    # Convert float model.
-    converter = lite.TFLiteConverterV2.from_concrete_functions([func])
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+    float_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)

     # Convert model by specifying target spec (instead of optimizations), since
     # when targeting an integer only backend, quantization is mandatory.
     quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func])
-    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
+    quantized_converter.target_spec.supported_ops = [
+        lite.OpsSet.TFLITE_BUILTINS_INT8
+    ]
     quantized_converter.representative_dataset = calibration_gen
-    if use_target_ops_flag:
-      quantized_converter.target_spec.supported_ops = [
-          lite.OpsSet.TFLITE_BUILTINS_INT8
-      ]
-    else:
-      quantized_converter.target_spec.supported_types = [lite.constants.INT8]
-    quantized_converter.inference_input_type = inference_input_output_type
-    quantized_converter.inference_output_type = inference_input_output_type
-    quantized_tflite_model = quantized_converter.convert()
-    self.assertTrue(quantized_tflite_model)
+    quantized_converter._experimental_new_quantizer = mlir_quantizer
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)

-    interpreter = Interpreter(model_content=quantized_tflite_model)
+    # The default input and output types should be float.
+    interpreter = Interpreter(model_content=quantized_tflite)
     interpreter.allocate_tensors()
     input_details = interpreter.get_input_details()
     self.assertLen(input_details, 1)
-    self.assertEqual(inference_input_output_type.as_numpy_dtype,
-                     input_details[0]['dtype'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
     output_details = interpreter.get_output_details()
     self.assertLen(output_details, 1)
-    self.assertEqual(inference_input_output_type.as_numpy_dtype,
-                     output_details[0]['dtype'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])

-    # Ensure that the quantized tflite model is smaller.
-    self.assertLess(len(quantized_tflite_model), len(tflite_model))
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertLess(len(quantized_tflite), len(float_tflite))

   def testCalibrateAndQuantizeBuiltinInt16(self):
     func, calibration_gen = self._getCalibrationQuantizeModel()
@@ -374,7 +279,7 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest):
     return tf.keras.Sequential(QLinear(3, input_shape=(2,)))

   @test_util.run_v2_only
-  def testTrainingTimeQuantization(self):
+  def testTrainingTimeQuantizeConversion(self):
     model = self._getTrainingTimeQuantizedModel()

     float_converter = lite.TFLiteConverterV2.from_keras_model(model)
@@ -392,29 +297,6 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest):
     interpreter = Interpreter(model_content=quantized_tflite)
     self.assertEqual(np.float32, interpreter.get_input_details()[0]['dtype'])

-  @parameterized.named_parameters(
-      ('_INT8InputOutput', lite.constants.INT8),
-      ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8))
-  def testInvalidTrainingTimeQuantization(self, inference_input_output_type):
-    # We currently don't support integer inference_input_type and
-    # inference_output_type flags for training time quantization.
-    model = self._getTrainingTimeQuantizedModel()
-
-    converter = lite.TFLiteConverterV2.from_keras_model(model)
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
-
-    quantized_converter = lite.TFLiteConverterV2.from_keras_model(model)
-    quantized_converter.optimizations = [lite.Optimize.DEFAULT]
-    with self.assertRaises(ValueError) as error:
-      quantized_converter.inference_input_type = inference_input_output_type
-      quantized_converter.inference_output_type = inference_input_output_type
-      quantized_converter.convert()
-    self.assertEqual(
-        'The inference_input_type and inference_output_type '
-        'must be tf.float32.', str(error.exception))
-
   @test_util.run_v2_only
   def testNewQuantizer(self):
     """Test the model quantized by the new converter."""