The auto-mixed-precision pass is actually run before TF-TRT while we want it to run after TF-TRT, see b/170160323.
*** Original change description *** PR #42974: [TF-TRT] Adding AMP support for non converted OPs & native segments Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/42974 This PR adds support for TF-TRT AMP on the Tensorflow OPs that are not converted by TF-TRT and the fallback native segments. **Changes in the API:** - `TrtConversionParams` obtains a new parameter: `allow_mixed_precision_on_unconverted_ops`, True by default. Can be turned off manually. - This behavior is **only active** if `Trt... *** PiperOrigin-RevId: 335658381 Change-Id: Ib3d56c8d22cbc8fad505a92bd3d9b7efea76d79d
This commit is contained in:
parent
ede1385636
commit
6b59f0662e
@@ -296,10 +296,6 @@ # Release 2.4.0
|
||||
|
||||
* `tf.debugging.assert_shapes()` now works on `SparseTensor`s (#36268).
|
||||
|
||||
* `TensorRT`
|
||||
* Add parameter allow_mixed_precision_on_unconverted_ops to
|
||||
TrtConversionParams.
|
||||
|
||||
* `tf.print`:
|
||||
|
||||
* Bug fix in `tf.print()` with `OrderedDict` where if an `OrderedDict`
|
||||
|
@@ -24,10 +24,6 @@ namespace grappler {
|
||||
|
||||
enum class AutoMixedPrecisionMode { CUDA, MKL };
|
||||
|
||||
// Note: This is primarily used by the tf.experimental.tensorrt.Converter class
|
||||
// to use mixed precision on ops not converted by TensorRT. It is also used for
|
||||
// the soon-to-be-deprecated enable_mixed_precision_graph_rewrite API.
|
||||
//
|
||||
// Convert data types to float16 or bfloat16 where appropriate to improve
|
||||
// performance on GPUs or CPUs.
|
||||
class AutoMixedPrecision : public GraphOptimizer {
|
||||
|
@@ -119,7 +119,7 @@ class TrtConversionParams(
|
||||
"rewriter_config_template", "max_workspace_size_bytes",
|
||||
"precision_mode", "minimum_segment_size", "is_dynamic_op",
|
||||
"maximum_cached_engines", "use_calibration", "max_batch_size",
|
||||
"allow_build_at_runtime", "allow_mixed_precision_on_unconverted_ops"
|
||||
"allow_build_at_runtime"
|
||||
])):
|
||||
"""Parameters that are used for TF-TRT conversion.
|
||||
|
||||
@@ -160,10 +160,6 @@ class TrtConversionParams(
|
||||
inputs during runtime, then a new TensorRT engine is built at runtime if
|
||||
allow_build_at_runtime=True, and otherwise native TF is used. This
|
||||
argument is only effective if is_dynamic_op=True.
|
||||
allow_mixed_precision_on_unconverted_ops: whether to allow TensorFlow to
|
||||
use mixed precision on the operations which are not converted to inside
|
||||
a TensorRT engine. This argument has a default value of True, and is
|
||||
only effective if the requested `precision_mode` is lower than FP32.
|
||||
"""
|
||||
|
||||
def __new__(cls,
|
||||
@@ -175,13 +171,13 @@ class TrtConversionParams(
|
||||
maximum_cached_engines=1,
|
||||
use_calibration=True,
|
||||
max_batch_size=1,
|
||||
allow_build_at_runtime=True,
|
||||
allow_mixed_precision_on_unconverted_ops=True):
|
||||
return super(TrtConversionParams, cls).__new__(
|
||||
cls, rewriter_config_template, max_workspace_size_bytes, precision_mode,
|
||||
minimum_segment_size, is_dynamic_op, maximum_cached_engines,
|
||||
use_calibration, max_batch_size, allow_build_at_runtime,
|
||||
allow_mixed_precision_on_unconverted_ops)
|
||||
allow_build_at_runtime=True):
|
||||
return super(TrtConversionParams,
|
||||
cls).__new__(cls, rewriter_config_template,
|
||||
max_workspace_size_bytes, precision_mode,
|
||||
minimum_segment_size, is_dynamic_op,
|
||||
maximum_cached_engines, use_calibration,
|
||||
max_batch_size, allow_build_at_runtime)
|
||||
|
||||
|
||||
DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams()
|
||||
@@ -252,12 +248,6 @@ def _check_conversion_params(conversion_params, is_v2=False):
|
||||
"allow_build_at_runtime=False. If building TensorRT engines "
|
||||
"at runtime is desired, set is_dynamic_op=True."))
|
||||
|
||||
if not conversion_params.allow_mixed_precision_on_unconverted_ops:
|
||||
tf_logging.warn("Mixed precision on OPs not converted by TF-TRT has been "
|
||||
"deactivated. We recommend setting: "
|
||||
"`allow_mixed_precision_on_unconverted_ops=True` for "
|
||||
"performance reasons.")
|
||||
|
||||
|
||||
def _check_trt_version_compatibility():
|
||||
"""Check compatibility of TensorRT version.
|
||||
@@ -357,14 +347,6 @@ def get_tensorrt_rewriter_config(conversion_params,
|
||||
rewriter_config_with_trt.CopyFrom(
|
||||
conversion_params.rewriter_config_template)
|
||||
|
||||
if (conversion_params.allow_mixed_precision_on_unconverted_ops and
|
||||
conversion_params.precision_mode != TrtPrecisionMode.FP32):
|
||||
rewriter_config_with_trt.auto_mixed_precision = \
|
||||
rewriter_config_pb2.RewriterConfig.ON
|
||||
else:
|
||||
rewriter_config_with_trt.auto_mixed_precision = \
|
||||
rewriter_config_pb2.RewriterConfig.OFF
|
||||
|
||||
# Disabling optimizers should happen after CopyFrom the template
|
||||
# otherwise the template can overwrite the disablement.
|
||||
if disable_non_trt_optimizers:
|
||||
@@ -384,7 +366,6 @@ def get_tensorrt_rewriter_config(conversion_params,
|
||||
rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)
|
||||
rewriter_config_with_trt.pin_to_host_optimization = off
|
||||
rewriter_config_with_trt.auto_parallel.enable = False
|
||||
rewriter_config_with_trt.auto_mixed_precision = off
|
||||
|
||||
return rewriter_config_with_trt
|
||||
|
||||
@@ -458,8 +439,7 @@ class TrtGraphConverter(object):
|
||||
minimum_segment_size=3,
|
||||
is_dynamic_op=False,
|
||||
maximum_cached_engines=1,
|
||||
use_calibration=True,
|
||||
allow_mixed_precision_on_unconverted_ops=True):
|
||||
use_calibration=True):
|
||||
"""Initialize the converter.
|
||||
|
||||
Args:
|
||||
@@ -498,10 +478,6 @@ class TrtGraphConverter(object):
|
||||
will occur. Please note that accuracy may be negatively affected if
|
||||
there is a mismatch between which tensors TRT quantizes and which
|
||||
tensors were trained with fake quantization.
|
||||
allow_mixed_precision_on_unconverted_ops: whether to allow TensorFlow to
|
||||
use mixed precision on the operations which are not converted to inside
|
||||
a TensorRT engine. This argument has a default value of True, and is
|
||||
only effective if the requested `precision_mode` is lower than FP32.
|
||||
|
||||
Raises:
|
||||
ValueError: if the combination of the parameters is invalid.
|
||||
@@ -561,10 +537,7 @@ class TrtGraphConverter(object):
|
||||
maximum_cached_engines=maximum_cached_engines,
|
||||
use_calibration=use_calibration,
|
||||
max_batch_size=max_batch_size,
|
||||
allow_build_at_runtime=True,
|
||||
allow_mixed_precision_on_unconverted_ops=
|
||||
allow_mixed_precision_on_unconverted_ops
|
||||
)
|
||||
allow_build_at_runtime=True)
|
||||
_check_conversion_params(self._conversion_params)
|
||||
|
||||
def _run_conversion(self):
|
||||
|
@@ -7,10 +7,6 @@ tf_class {
|
||||
name: "allow_build_at_runtime"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "allow_mixed_precision_on_unconverted_ops"
|
||||
mtype: "<type \'property\'>"
|
||||
}
|
||||
member {
|
||||
name: "is_dynamic_op"
|
||||
mtype: "<type \'property\'>"
|
||||
|
Loading…
Reference in New Issue
Block a user