From a7899d7544230fce8dae4895733d82623af2b934 Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Tue, 21 Jan 2020 13:18:55 +0000
Subject: [PATCH 001/112] Added an option
 TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 to enable sym quantization
 with activations in 16-bit and weights in 8-bit.
---
 tensorflow/lite/python/convert.py             |   6 +
 tensorflow/lite/python/lite.py                |  13 +-
 tensorflow/lite/python/lite_constants.py      |   3 +
 tensorflow/lite/python/lite_test.py           |  14 +-
 .../python/optimize/calibration_wrapper.cc    |   8 +-
 .../python/optimize/calibration_wrapper.h     |   3 +-
 tensorflow/lite/python/optimize/calibrator.py |   6 +-
 .../lite/python/optimize/calibrator_test.py   |  39 ++-
 .../lite/tools/optimize/operator_property.cc  |  17 +-
 .../lite/tools/optimize/operator_property.h   |  10 +-
 .../lite/tools/optimize/quantization_utils.cc | 102 +++++--
 .../lite/tools/optimize/quantization_utils.h  |  10 +-
 .../tools/optimize/quantization_utils_test.cc |   4 +-
 .../tools/optimize/quantization_wrapper.cc    |   4 +-
 .../lite/tools/optimize/quantize_model.cc     | 175 +++++++-----
 .../lite/tools/optimize/quantize_model.h      |   7 +-
 .../tools/optimize/quantize_model_test.cc     | 258 ++++++++++++------
 17 files changed, 477 insertions(+), 202 deletions(-)

diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 2fe4d172487..494f32a515c 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -93,6 +93,12 @@ class OpsSet(enum.Enum):
   # quantized implementations.
   TFLITE_BUILTINS_INT8 = "TFLITE_BUILTINS_INT8"
 
+  # Convert model using only TensorFlow Lite operations with quantized int8 weights
+  # and int16 activations.
+  # Specifying this will throw an error for operations that do not yet have
+  # quantized implementations.
+  TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8"
+
   def __str__(self):
     return self.value
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 657cfea1bb8..fc9c064faf0 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -224,6 +224,10 @@ class TFLiteConverterBase(object):
             self.target_spec.supported_ops) or
            self._smallest_supported_type() == constants.INT8)
 
+  def _is_int16x8_target_required(self):
+    return (set([OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]) ==
+            set(self.target_spec.supported_ops))
+
   def _smallest_supported_type(self):
     if self.target_spec.supported_types:
       return min(self.target_spec.supported_types, key=lambda x: x.size)
@@ -238,7 +242,9 @@ class TFLiteConverterBase(object):
         ]))
 
   def _is_post_training_optimize(self):
-    return self._is_int8_target_required() or self._any_optimization_enabled()
+    return self._is_int8_target_required() or \
+      self._is_int16x8_target_required() or \
+      self._any_optimization_enabled()
 
   def _is_int8_weight_only_quantize(self):
     return (self._is_post_training_optimize() and
@@ -255,11 +261,12 @@ class TFLiteConverterBase(object):
   def _calibrate_quantize_model(self, result, inference_input_type,
                                 inference_output_type, enable_mlir_quantizer):
-    allow_float = not self._is_int8_target_required()
+    allow_float = not self._is_int8_target_required() and not self._is_int16x8_target_required()
     calibrate_quantize = _calibrator.Calibrator(result)
+    activations_type = constants.INT16 if self._is_int16x8_target_required() else constants.INT8
     return calibrate_quantize.calibrate_and_quantize(
         self.representative_dataset.input_gen, inference_input_type,
-
inference_output_type, allow_float, activations_type, enable_mlir_quantizer) def _get_base_converter_args(self): """Returns the base converter args. diff --git a/tensorflow/lite/python/lite_constants.py b/tensorflow/lite/python/lite_constants.py index d43452c775b..4902f23795e 100644 --- a/tensorflow/lite/python/lite_constants.py +++ b/tensorflow/lite/python/lite_constants.py @@ -30,6 +30,7 @@ INT64 = dtypes.int64 STRING = dtypes.string QUANTIZED_UINT8 = dtypes.uint8 INT8 = dtypes.int8 +INT16 = dtypes.int16 COMPLEX64 = dtypes.complex64 TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF TFLITE = _toco_flags_pb2.TFLITE @@ -43,6 +44,7 @@ _tf_export(v1=["lite.constants.STRING"]).export_constant(__name__, "STRING") _tf_export(v1=["lite.constants.QUANTIZED_UINT8"]).export_constant( __name__, "QUANTIZED_UINT8") _tf_export(v1=["lite.constants.INT8"]).export_constant(__name__, "INT8") +_tf_export(v1=["lite.constants.INT16"]).export_constant(__name__, "INT16") _tf_export(v1=["lite.constants.TFLITE"]).export_constant(__name__, "TFLITE") _tf_export(v1=["lite.constants.GRAPHVIZ_DOT"]).export_constant( __name__, "GRAPHVIZ_DOT") @@ -62,6 +64,7 @@ _allowed_symbols = [ "STRING", "QUANTIZED_UINT8", "INT8", + "INT16", "COMPLEX64", "TENSORFLOW_GRAPHDEF", "TFLITE", diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 16959c84146..ef5e5d1cdf4 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -769,9 +769,13 @@ class FromSessionTest(TestModels, parameterized.TestCase): self.assertLess(len(quantized_tflite), len(float_tflite)) @parameterized.named_parameters( - ('EnableMlirConverter', True), # enable mlir - ('DisableMlirConverter', False)) # disable mlir - def testCalibrateAndQuantizeBuiltinInt8(self, enable_mlir): + # Quantize model to Int8: with enable mlir + ('UseTfliteBuiltinsIntEnableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], True), + # Quantize model to Int8: with disable mlir + ('UseTfliteBuiltinsIntDisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], False), + # Quantize model to Int16: with disable mlir + ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) + def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() sess = session.Session() @@ -787,9 +791,7 @@ class FromSessionTest(TestModels, parameterized.TestCase): quantized_converter = lite.TFLiteConverter.from_session( sess, [inp], [output]) quantized_converter.experimental_new_converter = enable_mlir - quantized_converter.target_spec.supported_ops = [ - lite.OpsSet.TFLITE_BUILTINS_INT8 - ] + quantized_converter.target_spec.supported_ops = supported_ops quantized_converter.representative_dataset = calibration_gen quantized_tflite = quantized_converter.convert() self.assertTrue(quantized_tflite) diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 89ffb3430ea..88995136726 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -204,6 +204,7 @@ PyObject* CalibrationWrapper::SetTensor(int index, PyObject* value) { PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, int output_py_type, bool allow_float, + int activations_py_type, bool enable_mlir_quantizer) { if (NoOpModel(*model_)) { return 
python_utils::ConvertToPyString(model_str_->data(), @@ -212,6 +213,9 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, TfLiteType input_type = python_utils::TfLiteTypeFromPyType(input_py_type); TfLiteType output_type = python_utils::TfLiteTypeFromPyType(output_py_type); + TfLiteType activations_type = + python_utils::TfLiteTypeFromPyType(activations_py_type); + if (input_type == kTfLiteNoType || output_type == kTfLiteNoType) { PyErr_SetString(PyExc_ValueError, "Input/output type cannot be kTfLiteNoType"); @@ -230,7 +234,7 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, - error_reporter_.get()); + TfLiteTypeToSchemaType(activations_type), error_reporter_.get()); } if (status != kTfLiteOk) { @@ -262,7 +266,7 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, auto status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, {op_name}, - error_reporter_.get()); + TensorType_INT8, error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); return nullptr; diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.h b/tensorflow/lite/python/optimize/calibration_wrapper.h index 0fefc29dd81..e72fe15e958 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.h +++ b/tensorflow/lite/python/optimize/calibration_wrapper.h @@ -60,7 +60,8 @@ class CalibrationWrapper { PyObject* FeedTensor(PyObject* input_value); PyObject* QuantizeModel(int input_py_type, int output_py_type, - bool allow_float, bool enable_mlir_quantizer = false); + bool allow_float, int activations_py_type, + bool enable_mlir_quantizer = false); // Allows quantizing only the operator that produces the tensor with name // operator_output_name. (This can be used to help debug.). diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py index 6d9a29236f0..1f962917551 100644 --- a/tensorflow/lite/python/optimize/calibrator.py +++ b/tensorflow/lite/python/optimize/calibrator.py @@ -19,6 +19,7 @@ from __future__ import print_function import numpy as np from tensorflow.python.util.lazy_loader import LazyLoader +from tensorflow.lite.python import lite_constants # Lazy load since some of the performance benchmark skylark rules # break dependencies. Must use double quotes to match code internal rewrite @@ -55,7 +56,8 @@ class Calibrator(object): raise ValueError("Failed to parse the model.") def calibrate_and_quantize(self, dataset_gen, input_type, output_type, - allow_float, enable_mlir_quantizer=False): + allow_float, activations_type = lite_constants.INT8, + enable_mlir_quantizer=False): """Calibrates the model with specified generator and then quantizes it. Returns: @@ -69,6 +71,7 @@ class Calibrator(object): computation, useful when targeting an integer-only backend. If False, an error will be thrown if an operation cannot be quantized, otherwise the model will fallback to float ops. + activations_type: A tf.dtype representing the desired type for activations enable_mlir_quantizer: A boolean. True if wants to use mlir quantizer to quantize the calibrated model. 
""" @@ -78,6 +81,7 @@ class Calibrator(object): return self._calibrator.QuantizeModel( np.dtype(input_type.as_numpy_dtype()).num, np.dtype(output_type.as_numpy_dtype()).num, allow_float, + np.dtype(activations_type.as_numpy_dtype()).num, enable_mlir_quantizer) def calibrate_and_quantize_single(self, dataset_gen, input_type, output_type, diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py index 28e8723f23d..7ec5f8f526c 100644 --- a/tensorflow/lite/python/optimize/calibrator_test.py +++ b/tensorflow/lite/python/optimize/calibrator_test.py @@ -33,9 +33,13 @@ from tensorflow.python.platform import test class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def test_calibration_with_quantization(self, enable_mlir): + # Activation type Int8 - enable mlir quantizer + ('UseActivationTypeInt8EnabledMlir', constants.INT8, True), + # Activation type Int8 - disable mlir quantizer + ('UseActivationTypeInt8DisabledMlir', constants.INT8, False), + # Activation type Int16 + ('UseActivationTypeInt16', constants.INT16, False)) + def test_calibration_with_quantization(self, activations_type, enable_mlir): model_path = resource_loader.get_path_to_datafile( 'test_data/mobilenet_like_model.bin') float_model = open(model_path, 'rb').read() @@ -49,13 +53,18 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): quantized_model = quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, constants.FLOAT, False, + activations_type, enable_mlir) self.assertIsNotNone(quantized_model) @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def test_calibration_with_quantization_allow_float(self, enable_mlir): + # Activation type Int8 - enable mlir quantizer + ('UseActivationTypeInt8EnabledMlir', constants.INT8, True), + # Activation type Int8 - disable mlir quantizer + ('UseActivationTypeInt8DisableMlir', constants.INT8, False), + # Activation type Int16 - disable mlir quantizer + ('UseActivationTypeInt16', constants.INT16, False)) + def test_calibration_with_quantization_allow_float(self, activations_type, enable_mlir): model_path = resource_loader.get_path_to_datafile( 'test_data/mobilenet_like_model.bin') float_model = open(model_path, 'rb').read() @@ -69,6 +78,7 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): quantized_model = quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, constants.FLOAT, True, + activations_type, enable_mlir) self.assertIsNotNone(quantized_model) @@ -88,9 +98,13 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): self.assertIsNotNone(quantized_model) @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def test_calibration_with_quantization_multiple_inputs(self, enable_mlir): + # Activation type Int8 - enable mlir quantizer + ('UseActivationTypeInt8 - EnableMlirQuantizer', constants.INT8, True), + # Activation type Int8 - disable mlir quantizer + ('UseActivationTypeInt8 - DisableMlirQuantizer', constants.INT8, False), + # Activation type Int16 - disable mlir quantizer + ('UseActivationTypeInt16 - DisableEnableMlirQuantizer', constants.INT16, False)) + def 
test_calibration_with_quantization_multiple_inputs(self, activations_type, enable_mlir): # Load multi add model from test data. # This model has 4 inputs of size (1, 8, 8, 3). model_path = resource_loader.get_path_to_datafile( @@ -106,6 +120,7 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): quantized_model = quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, constants.FLOAT, False, + activations_type, enable_mlir) self.assertIsNotNone(quantized_model) @@ -148,7 +163,8 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): with self.assertRaisesRegex(ValueError, 'Size mismatch'): quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, - constants.FLOAT, False, enable_mlir) + constants.FLOAT, False, + enable_mlir) @parameterized.named_parameters( ('EnableMlirQuantizer', True), # enable mlir quantizer @@ -166,7 +182,8 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): with self.assertRaises(ValueError): quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, - constants.FLOAT, False, enable_mlir) + constants.FLOAT, False, + constants.INT8, enable_mlir) if __name__ == '__main__': diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 13f63092761..1f2d8bb4a4d 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -64,6 +64,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}, {1, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantize_input_as_activations = true; break; case BuiltinOperator_ARG_MAX: property.inputs = {{0, {}}}; @@ -176,7 +177,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // LogSoftmax requires output with 16/256 as scale and 127 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {16.0 / 256.0, 127}; + tensor_property.restricted_value_int8 = {16.0 / 256.0, 127}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -186,7 +187,8 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // Logistic requires output with 1/256 as scale and -128 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 256.0, -128}; + tensor_property.restricted_value_int8 = {1 / 256.0, -128}; + tensor_property.restricted_value_int16 = {1 / 32768.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -741,7 +743,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // L2 Norm requires output with 1/128 as scale and 0 as zero point. 
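For reference, the parameterized tests above drive the new ops set through the public converter API. A minimal end-to-end sketch of the intended usage, mirroring the lite_test.py parameterization (the saved-model path and calibration data below are placeholders, not part of this patch):

    from tensorflow.lite.python import lite

    # `saved_model_dir` and `calibration_samples` are hypothetical inputs.
    converter = lite.TFLiteConverter.from_saved_model(saved_model_dir)
    # Request 8-bit weights with 16-bit activations; the converter then runs
    # post-training calibration and quantization with 16-bit activations.
    converter.target_spec.supported_ops = [
        lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
    ]

    def representative_dataset():
      for sample in calibration_samples:
        yield [sample]

    converter.representative_dataset = representative_dataset
    tflite_model = converter.convert()

The same path can be exercised directly through Calibrator.calibrate_and_quantize by passing lite_constants.INT16 as the activations type, as the calibrator tests above do.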
TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 128.0, 0}; + tensor_property.restricted_value_int8 = {1 / 128.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -756,6 +758,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.arbitrary_inputs = true; property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; + property.quantize_input_as_activations = true; property.version = 2; break; case BuiltinOperator_MEAN: @@ -767,6 +770,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.arbitrary_inputs = true; property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; + property.quantize_input_as_activations = true; property.version = 2; break; case BuiltinOperator_MUL: @@ -778,6 +782,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.arbitrary_inputs = true; property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; + property.restrict_same_input_output_scale = true; property.version = 2; break; case BuiltinOperator_PAD: @@ -840,7 +845,8 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // Softmax requires output with 1/256 as scale and -128 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 256.0, -128}; + tensor_property.restricted_value_int8 = {1 / 256.0, -128}; + tensor_property.restricted_value_int16 = {1 / 32768.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; @@ -866,7 +872,8 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // Tanh requires output with 1/128 as scale and 0 as zero point. TensorProperty tensor_property; tensor_property.restriction = true; - tensor_property.restricted_value = {1 / 128.0, 0}; + tensor_property.restricted_value_int8 = {1 / 128.0, 0}; + tensor_property.restricted_value_int16 = {1 / 32768.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; break; diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h index 5d37aa304e5..23052308568 100644 --- a/tensorflow/lite/tools/optimize/operator_property.h +++ b/tensorflow/lite/tools/optimize/operator_property.h @@ -43,7 +43,8 @@ struct TensorProperty { // Constraints. bool restriction = false; // scale/zero_point hardcoded. - std::pair restricted_value = {0.0, 0}; + std::pair restricted_value_int8 = {0.0, 0}; + std::pair restricted_value_int16 = {0.0, 0}; // Use derived scale. bool use_derived_scale = false; @@ -93,6 +94,13 @@ struct OperatorProperty { // Op version. int version = 1; + + // When we quantize activations into 16 bit and weights into 8 bit, + // we want to quantize all inputs, including constant tensors, + // for the operators like Add, Mul into 16-bit as well. The constant + // inputs are quantized as weights and this variable indicates + // that we want to do quantizations of these tensors as activations. 
+ bool quantize_input_as_activations = false; }; OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index 10680758d72..4bc9686ec2c 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -20,7 +20,6 @@ limitations under the License. #include #include "absl/memory/memory.h" -#include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" @@ -30,6 +29,7 @@ limitations under the License. #include "tensorflow/lite/minimal_logging.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/tools/optimize/model_utils.h" +#include "third_party/eigen3/Eigen/Core" namespace tflite { namespace optimize { @@ -85,6 +85,46 @@ void GetAsymmetricQuantizationParams( quantization_params->zero_point = std::vector(1, zero_point); } +void GetSymmetricQuantizationParams( + float min, float max, const int half_quant_range, + QuantizationParametersT* quantization_params) { + // Adjust the boundaries to guarantee 0 is included. + min = std::min(min, 0.0f); + max = std::max(max, 0.0f); + const float scale = std::max(std::abs(max), std::abs(min)) / half_quant_range; + int64_t zero_point = 0; + quantization_params->min = std::vector(1, min); + quantization_params->max = std::vector(1, max); + quantization_params->scale = std::vector(1, scale); + quantization_params->zero_point = std::vector(1, 0); +} + +TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, + QuantizationParametersT* quantization_params, + ErrorReporter* error_reporter) { + if (activations_type == TensorType_INT8) { + GetAsymmetricQuantizationParams( + tensor->quantization->min[0], tensor->quantization->max[0], + std::numeric_limits::min(), std::numeric_limits::max(), + quantization_params); + } else if (activations_type == TensorType_INT16) { + float range = std::max(std::abs(tensor->quantization->min[0]), + std::abs(tensor->quantization->max[0])); + const float quantized_range = 32767.0; + const float scale = range / quantized_range; + quantization_params->min = std::vector(1, -range); + quantization_params->max = std::vector(1, range); + quantization_params->scale = std::vector(1, scale); + quantization_params->zero_point = std::vector(1, 0); + } else { + error_reporter->Report( + "Unsupported activation type for quantize-activation: %s", + activations_type); + return kTfLiteError; + } + return kTfLiteOk; +} + // Set the max and min quantization parameter for a single tensor given its // values. 
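To make the new int16 branch of GetQuantizationParams above concrete: int8 activations keep the existing asymmetric parameters, while int16 activations are quantized symmetrically, with the scale covering the larger of |min| and |max| and the zero point pinned to 0. A rough Python rendering of that branch (the function and variable names here are illustrative, not the patch's):

    def int16_activation_params(min_value, max_value):
      """Symmetric 16-bit activation parameters, zero point fixed at 0."""
      value_range = max(abs(min_value), abs(max_value))
      scale = value_range / 32767.0  # 32767 == numeric_limits<int16_t>::max()
      return scale, 0

    # Example: a tensor calibrated to [-3.0, 6.0] gets scale 6.0 / 32767, zero point 0.
    print(int16_activation_params(-3.0, 6.0))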
void FillSingleMinMax(const float* const input, const uint64_t input_size, @@ -536,6 +576,7 @@ TfLiteStatus SymmetricQuantizeTensorPerChannel(ModelT* model, TensorT* tensor, model, tensor, error_reporter); } +template TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor, float scaling_factor, ErrorReporter* error_reporter) { @@ -548,25 +589,38 @@ TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor, uint64_t num_elements; TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements)); - std::vector final_buffer(num_elements); - const int32_t kScale = std::numeric_limits::max(); + std::vector final_buffer(num_elements); + const BiasType kScale = std::numeric_limits::max(); for (size_t i = 0; i < num_elements; i++) { - const int32_t quantized_value = tflite::SafeCast( + const BiasType quantized_value = tflite::SafeCast( TfLiteRound(float_data[i] * scaling_factor_inv)); final_buffer[i] = std::min(kScale, std::max(-kScale, quantized_value)); } // Set the buffers and output type. uint8_t* uint8_buffer = reinterpret_cast(final_buffer.data()); - size_t buffer_size = num_elements * sizeof(int32_t); + size_t buffer_size = num_elements * sizeof(BiasType); std::vector scales(1, scaling_factor); std::vector zero_points(1, 0); + + auto output_type = std::is_same::value + ? TensorType_INT32 + : TensorType_INT64; return AddQuantizationParams(scales, zero_points, 0, uint8_buffer, - buffer_size, TensorType_INT32, model, tensor, + buffer_size, output_type, model, tensor, error_reporter); } +template TfLiteStatus SymmetricPerLayerBiasQuantize( + ModelT* model, TensorT* tensor, float scaling_factor, + ErrorReporter* error_reporter); + +template TfLiteStatus SymmetricPerLayerBiasQuantize( + ModelT* model, TensorT* tensor, float scaling_factor, + ErrorReporter* error_reporter); + +template TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, float input_scale, const float* weight_scales, @@ -583,14 +637,14 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, uint64_t num_elements; TF_LITE_ENSURE_STATUS(NumElements(*tensor, &num_elements)); - std::vector final_buffer(num_elements); - const int32_t kScale = std::numeric_limits::max(); + std::vector final_buffer(num_elements); + const BiasType kScale = std::numeric_limits::max(); for (int32_t channel_idx = 0; channel_idx < number_of_dimension; channel_idx++) { float scaling_factor = scales[channel_idx]; float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor; - const int32_t quantized_value = tflite::SafeCast( + const BiasType quantized_value = tflite::SafeCast( TfLiteRound(float_data[channel_idx] * scaling_factor_inv)); final_buffer[channel_idx] = std::min(kScale, std::max(-kScale, quantized_value)); @@ -598,12 +652,26 @@ TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, // Set the buffers and output type. uint8_t* uint8_buffer = reinterpret_cast(final_buffer.data()); - size_t buffer_size = num_elements * sizeof(int32_t); + size_t buffer_size = num_elements * sizeof(BiasType); std::vector zero_point(scales.size(), 0); + + auto output_type = std::is_same::value + ? 
TensorType_INT32 + : TensorType_INT64; return AddQuantizationParams(scales, zero_point, 0, uint8_buffer, buffer_size, - TensorType_INT32, model, tensor, error_reporter); + output_type, model, tensor, error_reporter); } +template TfLiteStatus SymmetricPerChannelBiasQuantize( + ModelT* model, TensorT* tensor, float input_scale, + const float* weight_scales, int number_of_dimension, + ErrorReporter* error_reporter); + +template TfLiteStatus SymmetricPerChannelBiasQuantize( + ModelT* model, TensorT* tensor, float input_scale, + const float* weight_scales, int number_of_dimension, + ErrorReporter* error_reporter); + TfLiteStatus QuantizeWeight(ModelT* model, TensorT* tensor, bool per_channel, int per_axis_index, ErrorReporter* error_reporter) { // TODO(suharshs): Currently we conflate quantizing weights and constants. Its @@ -645,12 +713,12 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx, return scale; } -void QuantizeActivation(TensorT* tensor) { - GetAsymmetricQuantizationParams( - tensor->quantization->min[0], tensor->quantization->max[0], - std::numeric_limits::min(), std::numeric_limits::max(), - tensor->quantization.get()); - tensor->type = TensorType_INT8; +TfLiteStatus QuantizeActivation(TensorT* tensor, TensorType activations_type, + ErrorReporter* error_reporter) { + TF_LITE_ENSURE_STATUS(GetQuantizationParams( + tensor, activations_type, tensor->quantization.get(), error_reporter)); + tensor->type = activations_type; + return kTfLiteOk; } TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale) { diff --git a/tensorflow/lite/tools/optimize/quantization_utils.h b/tensorflow/lite/tools/optimize/quantization_utils.h index 18ed707e175..752b4253250 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.h +++ b/tensorflow/lite/tools/optimize/quantization_utils.h @@ -113,12 +113,14 @@ TfLiteStatus SymmetricQuantizeFloatsToInt16(ModelT* model, TensorT* tensor, ErrorReporter* error_reporter); // Symmetrically quantized the bias for per-layer ops (i.e. FullyConnected). +template TfLiteStatus SymmetricPerLayerBiasQuantize(ModelT* model, TensorT* tensor, float scaling_factor, ErrorReporter* error_reporter); // Symmetrically quantizes the bias for ops like Conv and DepthwiseConv. // The scale of bias if weight_per_channel_scale[channel] * input_scale. +template TfLiteStatus SymmetricPerChannelBiasQuantize(ModelT* model, TensorT* tensor, float input_scale, const float* weight_scales, @@ -135,8 +137,14 @@ float GetEffectiveScale(ModelT* model, SubGraphT* subgraph, int op_idx, std::vector intermediate_index, std::vector factors); +// Return quantization parameters depending on activations type. +TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, + QuantizationParametersT* quantization_params, + ErrorReporter* error_reporter); + // Quantize activation. -void QuantizeActivation(TensorT* tensor); +TfLiteStatus QuantizeActivation(TensorT* tensor, TensorType activations_type, + ErrorReporter* error_reporter); // Quantize activation to 16bit. 
TfLiteStatus QuantizeActivationToInt16(TensorT* tensor, float scale); diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc index ece0123d166..49009e49600 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils_test.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc @@ -701,7 +701,7 @@ TEST_F(QuantizationUtilsTest, SymmetricPerLayerBiasQuantize) { model->buffers.push_back(std::move(buffer)); // Call and verify. - EXPECT_EQ(SymmetricPerLayerBiasQuantize( + EXPECT_EQ(SymmetricPerLayerBiasQuantize( model.get(), model->subgraphs[0]->tensors[0].get(), input_scale * weight_scale, &error_reporter_), kTfLiteOk); @@ -759,7 +759,7 @@ TEST_F(QuantizationUtilsTest, SymmetricPerChannelBiasQuantize) { model->buffers.push_back(std::move(buffer)); // Call and verify. - EXPECT_EQ(SymmetricPerChannelBiasQuantize( + EXPECT_EQ(SymmetricPerChannelBiasQuantize( model.get(), model->subgraphs[0]->tensors[0].get(), input_scale, weight_scales.data(), 2, &error_reporter_), kTfLiteOk); diff --git a/tensorflow/lite/tools/optimize/quantization_wrapper.cc b/tensorflow/lite/tools/optimize/quantization_wrapper.cc index bd3331da6bf..5002c382bc7 100644 --- a/tensorflow/lite/tools/optimize/quantization_wrapper.cc +++ b/tensorflow/lite/tools/optimize/quantization_wrapper.cc @@ -42,7 +42,9 @@ bool CreateQuantizedModel(const std::string& path) { tflite::StderrReporter error_reporter; if (tflite::optimize::QuantizeModel( &builder, &model, tflite::TensorType_FLOAT32, - tflite::TensorType_FLOAT32, &error_reporter) != kTfLiteOk) { + tflite::TensorType_FLOAT32, + // TODO: Pass required activation type if needed + tflite::TensorType_INT8, &error_reporter) != kTfLiteOk) { return false; } return WriteFile(path, builder.GetBufferPointer(), builder.GetSize()); diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 6fc19ff2a56..ee562fe9c4c 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -64,6 +64,7 @@ operator_property::OperatorProperty GetOperatorProperty( TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor, const TensorT* weight_tensor, TensorT* bias_tensor, bool is_per_channel, int channel_dim_index, + const TensorType& activations_type, ErrorReporter* error_reporter) { if (bias_tensor->shape.size() != 1) { error_reporter->Report("Expected bias tensor shape to be 1."); @@ -92,9 +93,15 @@ TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor, weight_scales.size()); return kTfLiteError; } - return utils::SymmetricPerChannelBiasQuantize( - model, bias_tensor, input_tensor->quantization->scale[0], - weight_scales.data(), channel_dim_size, error_reporter); + if (activations_type == tflite::TensorType_INT16) { + return utils::SymmetricPerChannelBiasQuantize( + model, bias_tensor, input_tensor->quantization->scale[0], + weight_scales.data(), channel_dim_size, error_reporter); + } else { + return utils::SymmetricPerChannelBiasQuantize( + model, bias_tensor, input_tensor->quantization->scale[0], + weight_scales.data(), channel_dim_size, error_reporter); + } } else { if (weight_scales.size() != 1) { error_reporter->Report( @@ -102,40 +109,54 @@ TfLiteStatus QuantizeBias(ModelT* model, const TensorT* input_tensor, weight_scales.size()); return kTfLiteError; } - return utils::SymmetricPerLayerBiasQuantize( - model, bias_tensor, - input_tensor->quantization->scale[0] * 
weight_scales[0], - error_reporter); + if (activations_type == tflite::TensorType_INT16) { + return utils::SymmetricPerLayerBiasQuantize( + model, bias_tensor, + input_tensor->quantization->scale[0] * weight_scales[0], + error_reporter); + } else { + return utils::SymmetricPerLayerBiasQuantize( + model, bias_tensor, + input_tensor->quantization->scale[0] * weight_scales[0], + error_reporter); + } } return kTfLiteError; } // True if the tensor type has to be modified. bool TensorTypeChangeRequired(const TensorT* tensor, const TensorType& type) { - // The quantized model is type INT8, so if the user provided type is INT8, we - // do not have to do any custom logic. Additionally, if the current tensor - // isn't INT8 quantized, the custom type doesn't apply. - return (type != TensorType_INT8 && tensor->type == TensorType_INT8 && - !tensor->quantization->scale.empty()); + // The quantized model is type INT8/INT16, so if the user provided type is + // INT8/INT16, we do not have to do any custom logic. Additionally, if the + // current tensor isn't INT8/INT16 quantized, the custom type doesn't apply. + bool int8check = type != TensorType_INT8 && tensor->type == TensorType_INT8 && + !tensor->quantization->scale.empty(); + bool int16check = type != TensorType_INT16 && + tensor->type == TensorType_INT16 && + !tensor->quantization->scale.empty(); + return (int8check || int16check); } // Sets the input type, adding a Leading Op node at the start of the model if // necessary. // Returns the new input tensor index. int32_t SetInputType(ModelT* model, SubGraphT* subgraph, - const int32_t tensor_idx, const TensorType& input_type) { + const int32_t tensor_idx, const TensorType& input_type, + const TensorType& activations_type) { TensorT* tensor = subgraph->tensors[tensor_idx].get(); if (!TensorTypeChangeRequired(tensor, input_type)) { return -1; } if (input_type == TensorType_FLOAT32 || input_type == TensorType_UINT8) { + std::string type_string = + activations_type == TensorType_INT16 ? "int16" : "int8"; // Create a new tensor to be the input of the leading Op. std::unique_ptr leading_op_input; if (input_type == TensorType_FLOAT32) { // Add tensor for quantize operator. Scales and zero points are not // needed. const string leading_op_name = tensor->name; - const string new_name_original_input = tensor->name + "_int8"; + const string new_name_original_input = tensor->name + "_" + type_string; tensor->name = new_name_original_input; utils::MakeTensor(leading_op_name, tensor->shape, input_type, &leading_op_input); @@ -150,7 +171,7 @@ int32_t SetInputType(ModelT* model, SubGraphT* subgraph, TFLITE_DCHECK_GE(zero_point, -128); TFLITE_DCHECK_LE(zero_point, 127); const string leading_op_name = tensor->name; - const string new_name_original_input = tensor->name + "_int8"; + const string new_name_original_input = tensor->name + "_" + type_string; tensor->name = new_name_original_input; utils::MakeTensorWithQuantParam(leading_op_name, tensor->shape, input_type, scale, zero_point + 128, @@ -177,17 +198,20 @@ int32_t SetInputType(ModelT* model, SubGraphT* subgraph, // necessary. // Returns the new output tensor index. 
int32_t SetOutputType(ModelT* model, SubGraphT* subgraph, - const int32_t tensor_idx, const TensorType& output_type) { + const int32_t tensor_idx, const TensorType& output_type, + const TensorType& activations_type) { TensorT* tensor = subgraph->tensors[tensor_idx].get(); if (!TensorTypeChangeRequired(tensor, output_type)) { return -1; } if (output_type == TensorType_FLOAT32 || output_type == TensorType_UINT8) { + std::string type_string = + activations_type == TensorType_INT16 ? "int16" : "int8"; // Create a new tensor to be the output of the tailing op. std::unique_ptr tailing_op_output; if (output_type == TensorType_FLOAT32) { const string tailing_op_name = tensor->name; - const string new_name_original_output = tensor->name + "_int8"; + const string new_name_original_output = tensor->name + "_" + type_string; tensor->name = new_name_original_output; utils::MakeTensor(tailing_op_name, tensor->shape, output_type, &tailing_op_output); @@ -202,7 +226,7 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph, TFLITE_DCHECK_GE(zero_point, -128); TFLITE_DCHECK_LE(zero_point, 127); const string tailing_op_name = tensor->name; - const string new_name_original_output = tensor->name + "_int8"; + const string new_name_original_output = tensor->name + "_" + type_string; tensor->name = new_name_original_output; utils::MakeTensorWithQuantParam(tailing_op_name, tensor->shape, output_type, scale, zero_point + 128, @@ -238,6 +262,7 @@ int32_t SetOutputType(ModelT* model, SubGraphT* subgraph, // uint8, can be thought as "requant"). TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, const TensorType& output_type, + const TensorType& activations_type, ErrorReporter* error_reporter) { for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -253,8 +278,8 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, EnumNameTensorType(tensor->type)); return kTfLiteError; } - const int32_t input_idx = - SetInputType(model, subgraph, subgraph->inputs[i], input_type); + const int32_t input_idx = SetInputType( + model, subgraph, subgraph->inputs[i], input_type, activations_type); if (input_idx < 0) { continue; } @@ -270,8 +295,8 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, EnumNameTensorType(tensor->type)); return kTfLiteError; } - const int32_t output_idx = - SetOutputType(model, subgraph, subgraph->outputs[i], output_type); + const int32_t output_idx = SetOutputType( + model, subgraph, subgraph->outputs[i], output_type, activations_type); if (output_idx < 0) { continue; } @@ -287,6 +312,7 @@ TfLiteStatus SetInputAndOutputTypes(ModelT* model, const TensorType& input_type, // The other ones with constraints are handled in QuantizeWeightsAndInput. 
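ApplyConstraints below keeps handling ops such as concat whose inputs must share the output scale, except that the inserted requantize tensor is now created with the chosen activation type instead of a hard-coded INT8. Roughly, the per-input decision it makes looks like this (a simplified sketch, not the actual helper):

    def needs_requant(input_scale, input_zero_point, output_scale, output_zero_point):
      """An input of a scale-restricted op (e.g. CONCATENATION) gets a
      <name>_requantized tensor when its parameters differ from the output's."""
      return (input_scale, input_zero_point) != (output_scale, output_zero_point)

    # In the int16 concat test further below: input0 uses 5/32767, the output 10/32767.
    print(needs_requant(5.0 / 32767, 0, 10.0 / 32767, 0))  # True -> insert requant op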
TfLiteStatus ApplyConstraints(ModelT* model, const std::unordered_set& operator_names, + TensorType activations_type, ErrorReporter* error_reporter) { for (int subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -332,7 +358,7 @@ TfLiteStatus ApplyConstraints(ModelT* model, std::unique_ptr additional_tensor; const string requant_tensor_name = input_tensor->name + "_requantized"; utils::MakeTensorWithQuantParam( - requant_tensor_name, input_tensor->shape, TensorType_INT8, + requant_tensor_name, input_tensor->shape, activations_type, output_scale, output_zp, &additional_tensor); const int32_t additional_tensor_idx = subgraph->tensors.size(); subgraph->tensors.push_back(std::move(additional_tensor)); @@ -382,7 +408,8 @@ std::vector> GetOutputs( bool ShouldRestrictSameInputOutputScale( operator_property::OperatorProperty property) { - // Ops with multiple inputs (i.e. concat) gets restricted in ApplyConstraints. + // Ops with multiple inputs (i.e. concat, max and min) gets restricted in + // ApplyConstraints. return (!property.arbitrary_inputs && property.restrict_same_input_output_scale); } @@ -401,7 +428,7 @@ TfLiteStatus QuantizeOpInput( ModelT* model, int32_t subgraph_idx, size_t* op_idx, operator_property::OperatorProperty property, const std::pair& input, - ErrorReporter* error_reporter) { + const TensorType& activations_type, ErrorReporter* error_reporter) { int32_t input_idx = input.first; operator_property::TensorProperty tensor_property = input.second; SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -429,7 +456,9 @@ TfLiteStatus QuantizeOpInput( if (utils::HasBuffer(model, subgraph, tensor_idx)) { // TODO(suharshs): Look at consumers, throw error if one consumer is // per-channel and one per-layer. - if (tensor_property.number_of_bits == 8) { + bool quantize_const_input = property.quantize_input_as_activations && + activations_type == TensorType_INT16; + if (tensor_property.number_of_bits == 8 && !quantize_const_input) { if (tensor_property.use_derived_scale) { // Currently 8bit tensors in input do not accept derived scale. return kTfLiteError; @@ -444,7 +473,7 @@ TfLiteStatus QuantizeOpInput( *op_idx); return kTfLiteError; } - } else if (tensor_property.number_of_bits == 16) { + } else if (tensor_property.number_of_bits == 16 || quantize_const_input) { if (tensor_property.use_derived_scale) { // Currently 16bit tensors in input do not accept derived scale. return kTfLiteError; @@ -476,8 +505,8 @@ TfLiteStatus QuantizeOpInput( tensor_property.derived_scale.input_tensors, tensor_property.derived_scale.intermediate_tensors, tensor_property.derived_scale.factors); - return utils::SymmetricPerLayerBiasQuantize(model, tensor, scale, - error_reporter); + return utils::SymmetricPerLayerBiasQuantize( + model, tensor, scale, error_reporter); } else if (tensor_property.number_of_bits == 10) { // When the number of bits is 10 (instead of 16), quantize the tensor to @@ -514,7 +543,8 @@ TfLiteStatus QuantizeOpInput( // Currently 8bit tensors in input do not accept derived scale. 
return kTfLiteError; } - utils::QuantizeActivation(tensor); + TF_LITE_ENSURE_STATUS(utils::QuantizeActivation( + tensor, activations_type, error_reporter)); } else if (tensor_property.number_of_bits == 16) { TensorT* tensor = subgraph->tensors[tensor_idx].get(); float quantized_range = 32767.0; @@ -532,13 +562,16 @@ TfLiteStatus QuantizeOpInput( } else { // If the tensor is not a model input, we need to add a Quantize // operation since the preceding op may require a float output. + std::string type_string = + activations_type == TensorType_INT16 ? "int16" : "int8"; std::unique_ptr op_output; - utils::MakeTensor(tensor->name + "_int8", tensor->shape, - TensorType_INT8, &op_output); + utils::MakeTensor(tensor->name + "_" + type_string, tensor->shape, + activations_type, &op_output); op_output->quantization = absl::make_unique(); op_output->quantization->min.push_back(tensor->quantization->min[0]); op_output->quantization->max.push_back(tensor->quantization->max[0]); - utils::QuantizeActivation(op_output.get()); + TF_LITE_ENSURE_STATUS(utils::QuantizeActivation( + op_output.get(), activations_type, error_reporter)); const int32_t quant_op_output_idx = subgraph->tensors.size(); subgraph->tensors.push_back(std::move(op_output)); std::unique_ptr quant_op; @@ -580,7 +613,7 @@ TfLiteStatus QuantizeOpOutput( ModelT* model, int32_t subgraph_idx, int32_t op_idx, operator_property::OperatorProperty property, const std::pair& output, - ErrorReporter* error_reporter) { + TensorType activations_type, ErrorReporter* error_reporter) { int32_t output_idx = output.first; operator_property::TensorProperty tensor_property = output.second; // If the operator is not quantizable, we don't need to do anything for the @@ -644,18 +677,22 @@ TfLiteStatus QuantizeOpOutput( const float max = input_tensor->quantization->max[0]; output_tensor->quantization->max = {max}; } - output_tensor->type = TensorType_INT8; + output_tensor->type = activations_type; } else if (tensor_property.restriction) { - const auto scale_and_zp = tensor_property.restricted_value; + const auto scale_and_zp = activations_type == TensorType_INT16 + ? tensor_property.restricted_value_int16 + : tensor_property.restricted_value_int8; + // Apply to output. output_tensor->quantization = absl::make_unique(); output_tensor->quantization->scale.push_back(scale_and_zp.first); output_tensor->quantization->zero_point.push_back(scale_and_zp.second); - output_tensor->type = TensorType_INT8; + output_tensor->type = activations_type; } else { // Process regular output that doesn't have any restrictions. 
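For the restricted-output branch above, the hard-coded parameters now come in two sets: for example, Softmax and Logistic outputs use scale 1/256 with zero point -128 in int8, and scale 1/32768 with zero point 0 in int16. A quick illustration of the real-valued ranges those choices cover:

    def dequant_range(scale, zero_point, qmin, qmax):
      """Real interval representable with the given quantization parameters."""
      return ((qmin - zero_point) * scale, (qmax - zero_point) * scale)

    # int8 restriction for Softmax/Logistic outputs covers [0, 1)
    print(dequant_range(1.0 / 256.0, -128, -128, 127))
    # int16 restriction keeps the zero point at 0 and covers [-1, 1)
    print(dequant_range(1.0 / 32768.0, 0, -32768, 32767))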
if (utils::HasMinMax(output_tensor)) { - utils::QuantizeActivation(output_tensor); + utils::QuantizeActivation(output_tensor, activations_type, + error_reporter); } else { error_reporter->Report( "Unable to find min/max value for output %d in %s in " @@ -668,6 +705,7 @@ TfLiteStatus QuantizeOpOutput( } TfLiteStatus QuantizeIntemediateTensors(ModelT* model, + TensorType activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -691,7 +729,8 @@ TfLiteStatus QuantizeIntemediateTensors(ModelT* model, input.second.symmetric == false) { TensorT* tensor = subgraph->tensors[index_global].get(); if (utils::HasMinMax(tensor)) { - utils::QuantizeActivation(tensor); + utils::QuantizeActivation(tensor, activations_type, + error_reporter); } else { error_reporter->Report( "Unable to find min/max value for output %d in %s in " @@ -793,7 +832,7 @@ TfLiteStatus QuantizeSharedRange(ModelT* model, ErrorReporter* error_reporter) { TfLiteStatus QuantizeWeightsInputOutput( ModelT* model, bool allow_float, const std::unordered_set& operator_names, - ErrorReporter* error_reporter) { + const TensorType& activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -815,14 +854,16 @@ TfLiteStatus QuantizeWeightsInputOutput( for (const std::pair& input : GetInputs(op, property)) { TF_LITE_ENSURE_STATUS(QuantizeOpInput(model, subgraph_idx, &op_idx, - property, input, error_reporter)); + property, input, activations_type, + error_reporter)); } // Quantize operator outputs. for (const std::pair& output : GetOutputs(op, property)) { - TF_LITE_ENSURE_STATUS(QuantizeOpOutput( - model, subgraph_idx, op_idx, property, output, error_reporter)); + TF_LITE_ENSURE_STATUS( + QuantizeOpOutput(model, subgraph_idx, op_idx, property, output, + activations_type, error_reporter)); } } } @@ -832,6 +873,7 @@ TfLiteStatus QuantizeWeightsInputOutput( // Quantize bias. TfLiteStatus QuantizeBiases(ModelT* model, const std::unordered_set& operator_names, + const TensorType& activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { @@ -877,10 +919,10 @@ TfLiteStatus QuantizeBiases(ModelT* model, subgraph->tensors[op->inputs[property.inputs[1].first]].get(); operator_property::TensorProperty weight_property = property.inputs[1].second; - TF_LITE_ENSURE_STATUS( - QuantizeBias(model, input_tensor, weight_tensor, bias_tensor, - weight_property.per_axis, - weight_property.per_axis_index, error_reporter)); + TF_LITE_ENSURE_STATUS(QuantizeBias( + model, input_tensor, weight_tensor, bias_tensor, + weight_property.per_axis, weight_property.per_axis_index, + activations_type, error_reporter)); } } } @@ -1000,7 +1042,7 @@ TfLiteStatus FillQuantizationParams( // Check compatibility of activation, weight and bias scales. Adjust if needed. TfLiteStatus EnsureBiasScaleCompatibility( ModelT* model, const std::unordered_set& operator_names, - ErrorReporter* error_reporter) { + TensorType activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); @@ -1049,11 +1091,9 @@ TfLiteStatus EnsureBiasScaleCompatibility( // Get input scale for assymmetric quantization. 
QuantizationParametersT temp_quant_params = QuantizationParametersT(); - utils::GetAsymmetricQuantizationParams( - input_tensor->quantization->min[0], - input_tensor->quantization->max[0], - std::numeric_limits::min(), - std::numeric_limits::max(), &temp_quant_params); + TF_LITE_ENSURE_STATUS( + utils::GetQuantizationParams(input_tensor, activations_type, + &temp_quant_params, error_reporter)); if (temp_quant_params.scale.size() != 1) { error_reporter->Report("Unexpected input quantization scale size."); return kTfLiteError; @@ -1132,21 +1172,24 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, const TensorType& output_type, bool allow_float, const std::unordered_set& operator_names, + const TensorType& activations_type, ErrorReporter* error_reporter) { TF_LITE_ENSURE_STATUS( FillQuantizationParams(model, operator_names, error_reporter)); + TF_LITE_ENSURE_STATUS(EnsureBiasScaleCompatibility( + model, operator_names, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS( - EnsureBiasScaleCompatibility(model, operator_names, error_reporter)); - TF_LITE_ENSURE_STATUS(QuantizeIntemediateTensors(model, error_reporter)); + QuantizeIntemediateTensors(model, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS(QuantizeSharedRange(model, error_reporter)); TF_LITE_ENSURE_STATUS(QuantizeWeightsInputOutput( - model, allow_float, operator_names, error_reporter)); + model, allow_float, operator_names, activations_type, error_reporter)); + TF_LITE_ENSURE_STATUS(ApplyConstraints(model, operator_names, + activations_type, error_reporter)); TF_LITE_ENSURE_STATUS( - ApplyConstraints(model, operator_names, error_reporter)); - TF_LITE_ENSURE_STATUS(QuantizeBiases(model, operator_names, error_reporter)); + QuantizeBiases(model, operator_names, activations_type, error_reporter)); utils::SetOperatorCodeVersion(model); - TF_LITE_ENSURE_STATUS( - SetInputAndOutputTypes(model, input_type, output_type, error_reporter)); + TF_LITE_ENSURE_STATUS(SetInputAndOutputTypes( + model, input_type, output_type, activations_type, error_reporter)); flatbuffers::Offset output_model_location = Model::Pack(*builder, model); @@ -1158,23 +1201,27 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, const TensorType& output_type, bool allow_float, + const TensorType& activations_type, ErrorReporter* error_reporter) { return QuantizeModel(builder, model, input_type, output_type, allow_float, - GetAllOperatorOutputs(model), error_reporter); + GetAllOperatorOutputs(model), activations_type, + error_reporter); } TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, const TensorType& output_type, + const TensorType& activations_type, ErrorReporter* error_reporter) { return QuantizeModel(builder, model, input_type, output_type, - /*allow_float=*/false, error_reporter); + /*allow_float=*/false, activations_type, error_reporter); } TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, ErrorReporter* error_reporter) { + ModelT* model, const TensorType& activations_type, + ErrorReporter* error_reporter) { return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/false, error_reporter); + /*allow_float=*/false, activations_type, error_reporter); } } // namespace optimize diff --git 
a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h index 9b0353f6b6b..cc801ec9870 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.h +++ b/tensorflow/lite/tools/optimize/quantize_model.h @@ -35,7 +35,9 @@ namespace optimize { // // Note: This is a private API, subject to change. TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* input_model, ErrorReporter* error_reporter); + ModelT* input_model, + const TensorType& activations_type, + ErrorReporter* error_reporter); // Same as above, but the types of quantized inputs and outputs are // configurable. @@ -44,6 +46,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, + const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but can enable allowing float intermediate operations for ops @@ -53,6 +56,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, + const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but enables only quantizing a whitelist of operations, @@ -63,6 +67,7 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, const std::unordered_set& operator_names, + const TensorType& activations_type, ErrorReporter* error_reporter); } // namespace optimize diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index da1b293c84b..166d60ecc66 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -80,28 +80,35 @@ class QuantizeModelTest : public testing::Test { internal::FailOnErrorReporter error_reporter_; }; -class QuantizeConvModelTest : public QuantizeModelTest { +class QuantizeConvModelTest : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeConvModelTest() { + tensor_type_ = GetParam(); input_model_ = ReadModel(internal::kConvModelWith0Plus10Weights); readonly_model_ = input_model_->GetModel(); readonly_model_->UnPackTo(&model_); } + TensorType tensor_type_; }; -TEST_F(QuantizeConvModelTest, QuantizationSucceeds) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +INSTANTIATE_TEST_SUITE_P(QuantizeConvModelTestInst, QuantizeConvModelTest, + testing::ValuesIn({TensorType_INT8, + TensorType_INT16})); + +TEST_P(QuantizeConvModelTest, QuantizationSucceeds) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const uint8_t* buffer = builder_.GetBufferPointer(); const Model* output_model = GetModel(buffer); ASSERT_TRUE(output_model); } -TEST_F(QuantizeConvModelTest, SkipUnspecifiedLayer) { +TEST_P(QuantizeConvModelTest, SkipUnspecifiedLayer) { auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/true, {}, &error_reporter_); + /*allow_float=*/true, {}, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.subgraphs.size(), 
readonly_model_->subgraphs()->size()); // The resulting model should be the same. @@ -123,9 +130,9 @@ TEST_F(QuantizeConvModelTest, SkipUnspecifiedLayer) { } } -TEST_F(QuantizeConvModelTest, TensorShapesAndStructureIsUnchanged) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, TensorShapesAndStructureIsUnchanged) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size()); for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -148,9 +155,9 @@ TEST_F(QuantizeConvModelTest, TensorShapesAndStructureIsUnchanged) { EXPECT_EQ(model_.operator_codes[0]->version, 3); } -TEST_F(QuantizeConvModelTest, OperatorsAreUnchanged) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, OperatorsAreUnchanged) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.operator_codes.size(), readonly_model_->operator_codes()->size()); @@ -182,20 +189,28 @@ TEST_F(QuantizeConvModelTest, OperatorsAreUnchanged) { } } -TEST_F(QuantizeConvModelTest, GraphIsFullyQuantized) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (const auto& subgraph : model_.subgraphs) { for (const auto& tensor : subgraph->tensors) { - EXPECT_TRUE(tensor->type == TensorType_INT32 || - tensor->type == TensorType_INT8); + if (tensor_type_ == TensorType_INT8) { + EXPECT_TRUE(tensor->type == TensorType_INT32 || + tensor->type == TensorType_INT8); + } else if (tensor_type_ == TensorType_INT16) { + EXPECT_TRUE(tensor->type == TensorType_INT64 || // bias + tensor->type == TensorType_INT8 || // weights + tensor->type == TensorType_INT16); // activations + } } } } -TEST_F(QuantizeConvModelTest, FloatInputAndOutput) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); +TEST_P(QuantizeConvModelTest, FloatInputAndOutput) { + auto status = + QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -234,22 +249,33 @@ TEST_F(QuantizeConvModelTest, FloatInputAndOutput) { EXPECT_EQ(subgraph->tensors[output_idx]->type, TensorType_FLOAT32); EXPECT_EQ(subgraph->tensors[output_idx]->name, "output"); // The original input and output has been renamed. - EXPECT_EQ(subgraph->tensors[quant_op->outputs[0]]->name, "input_int8"); - EXPECT_EQ(subgraph->tensors[dequant_op->inputs[0]]->name, "output_int8"); + std::string control_suffix = + (tensor_type_ == TensorType_INT16) ? 
"int16" : "int8"; + EXPECT_EQ(subgraph->tensors[quant_op->outputs[0]]->name, + "input_" + control_suffix); + EXPECT_EQ(subgraph->tensors[dequant_op->inputs[0]]->name, + "output_" + control_suffix); for (int tensor_idx = 0; tensor_idx < subgraph->tensors.size(); ++tensor_idx) { const auto& tensor = subgraph->tensors[tensor_idx]; if (input_idx != tensor_idx && output_idx != tensor_idx) { - EXPECT_TRUE(tensor->type == TensorType_INT32 || - tensor->type == TensorType_INT8); + if (tensor_type_ == TensorType_INT8) { + EXPECT_TRUE(tensor->type == TensorType_INT32 || + tensor->type == TensorType_INT8); + } else if (tensor_type_ == TensorType_INT16) { + EXPECT_TRUE(tensor->type == TensorType_INT64 || // bias + tensor->type == TensorType_INT8 || // weights + tensor->type == TensorType_INT16); // activations + } } } } } -TEST_F(QuantizeConvModelTest, Uint8InputAndOutput) { - auto status = QuantizeModel(&builder_, &model_, TensorType_UINT8, - TensorType_UINT8, &error_reporter_); +TEST_P(QuantizeConvModelTest, Uint8InputAndOutput) { + auto status = + QuantizeModel(&builder_, &model_, TensorType_UINT8, TensorType_UINT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -326,21 +352,25 @@ class QuantizeConvNoBiasModelTest : public QuantizeModelTest { }; TEST_F(QuantizeConvNoBiasModelTest, QuantizationSucceeds) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const uint8_t* buffer = builder_.GetBufferPointer(); const Model* output_model = GetModel(buffer); ASSERT_TRUE(output_model); } -class QuantizeConcatModelTest : public QuantizeModelTest { +class QuantizeConcatModelTest : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeConcatModelTest() { input_model_ = ReadModel(internal::kFloatConcatMax5Max10Max10); readonly_model_ = input_model_->GetModel(); readonly_model_->UnPackTo(&model_); } + + TensorType tensor_type_; }; // There are two inputs for concat, "input0" and "input1". "input0" has [0, 5] @@ -352,9 +382,9 @@ class QuantizeConcatModelTest : public QuantizeModelTest { // input0 -> requant -> input0_requant \ // concat - output // input1 / -TEST_F(QuantizeConcatModelTest, AddRequantBeforeConcat) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); +TEST_P(QuantizeConcatModelTest, AddRequantBeforeConcat) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. @@ -373,32 +403,51 @@ TEST_F(QuantizeConcatModelTest, AddRequantBeforeConcat) { EXPECT_EQ(model_.operator_codes[concat->opcode_index]->builtin_code, BuiltinOperator_CONCATENATION); + auto zero_point_control = tensor_type_ == TensorType_INT8 ? -128 : 0; + /* + input0_scale_control + INT8: (5-0) / (2^8 - 1) + INT16: (5-0) / (2^16 / 2 - 1) + input1_scale + INT8: (10-0) / (2^8 - 1) + INT16: (10-0) / (2^16 / 2 - 1) + */ + auto input0_scale_control = + tensor_type_ == TensorType_INT8 ? 0.019607844 : 0.00015259254; + auto input1_scale = + tensor_type_ == TensorType_INT8 ? 0.039215688 : 0.00030518509; + // There should be 4 tensors: input0, input1, input0_requantized, output. 
EXPECT_EQ(subgraph->tensors.size(), 4); - EXPECT_EQ(subgraph->tensors[0]->type, TensorType_INT8); + EXPECT_EQ(subgraph->tensors[0]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[0]->name, "input0"); EXPECT_EQ(subgraph->tensors[0]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[0]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->scale[0], 0.019607844); - EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->zero_point[0], -128); - EXPECT_EQ(subgraph->tensors[1]->type, TensorType_INT8); + EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->scale[0], + input0_scale_control); + EXPECT_FLOAT_EQ(subgraph->tensors[0]->quantization->zero_point[0], + zero_point_control); + EXPECT_EQ(subgraph->tensors[1]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[1]->name, "input1"); EXPECT_EQ(subgraph->tensors[1]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[1]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->scale[0], 0.039215688); - EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->zero_point[0], -128); - EXPECT_EQ(subgraph->tensors[2]->type, TensorType_INT8); + EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->scale[0], input1_scale); + EXPECT_FLOAT_EQ(subgraph->tensors[1]->quantization->zero_point[0], + zero_point_control); + EXPECT_EQ(subgraph->tensors[2]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[2]->name, "output"); EXPECT_EQ(subgraph->tensors[2]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[2]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->scale[0], 0.039215688); - EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->zero_point[0], -128); - EXPECT_EQ(subgraph->tensors[3]->type, TensorType_INT8); + EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->scale[0], input1_scale); + EXPECT_FLOAT_EQ(subgraph->tensors[2]->quantization->zero_point[0], + zero_point_control); + EXPECT_EQ(subgraph->tensors[3]->type, tensor_type_); EXPECT_EQ(subgraph->tensors[3]->name, "input0_requantized"); EXPECT_EQ(subgraph->tensors[3]->quantization->scale.size(), 1); EXPECT_EQ(subgraph->tensors[3]->quantization->zero_point.size(), 1); - EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->scale[0], 0.039215688); - EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->zero_point[0], -128); + EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->scale[0], input1_scale); + EXPECT_FLOAT_EQ(subgraph->tensors[3]->quantization->zero_point[0], + zero_point_control); // The connection should be what is described in the comment. EXPECT_EQ(requant->inputs.size(), 1); @@ -419,7 +468,9 @@ TEST_F(QuantizeConcatModelTest, AddRequantBeforeConcat) { EXPECT_EQ(model_.operator_codes[1]->builtin_code, BuiltinOperator_QUANTIZE); EXPECT_EQ(model_.operator_codes[1]->version, 2); } - +INSTANTIATE_TEST_SUITE_P(QuantizeConcatModelInst, QuantizeConcatModelTest, + testing::ValuesIn({TensorType_INT8, + TensorType_INT16})); class QuantizeSplitModelTest : public QuantizeModelTest { protected: QuantizeSplitModelTest() { @@ -432,8 +483,9 @@ class QuantizeSplitModelTest : public QuantizeModelTest { // There are two outputs for split with different scales, the resulting model // should have the scales be hardcodes to the input scale value. 
TEST_F(QuantizeSplitModelTest, QuantizeSplit) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. @@ -496,8 +548,9 @@ class QuantizeConvModel1Test : public QuantizeModelTest { }; TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const auto& subgraph = model_.subgraphs[0]; @@ -587,18 +640,25 @@ TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { EXPECT_EQ(model_.operator_codes[0]->version, 3); } -class QuantizeConvModel2Test : public QuantizeModelTest { +class QuantizeConvModel2Test : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeConvModel2Test() { + tensor_type_ = GetParam(); input_model_ = ReadModel(internal::kConvModelWith0Plus10Weights); readonly_model_ = input_model_->GetModel(); readonly_model_->UnPackTo(&model_); } -}; -TEST_F(QuantizeConvModel2Test, VerifyConvQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + TensorType tensor_type_; +}; +INSTANTIATE_TEST_SUITE_P(QuantizeConvModel2TestInst, QuantizeConvModel2Test, + testing::ValuesIn({TensorType_INT8, + TensorType_INT16})); + +TEST_P(QuantizeConvModel2Test, VerifyConvQuantization) { + auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + tensor_type_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto conv_op = subgraph->operators[0].get(); @@ -615,8 +675,10 @@ TEST_F(QuantizeConvModel2Test, VerifyConvQuantization) { const auto output_tensor = subgraph->tensors[conv_op->outputs[output_tensor_idx]].get(); - EXPECT_EQ(bias_tensor->type, TensorType_INT32); - EXPECT_EQ(input_tensor->type, TensorType_INT8); + EXPECT_EQ(bias_tensor->type, tensor_type_ == TensorType_INT8 + ? TensorType_INT32 + : TensorType_INT64); + EXPECT_EQ(input_tensor->type, tensor_type_); EXPECT_EQ(weights_tensor->type, TensorType_INT8); ASSERT_TRUE(weights_tensor->quantization); @@ -644,17 +706,28 @@ TEST_F(QuantizeConvModel2Test, VerifyConvQuantization) { } const auto bias_buffer = model_.buffers[bias_tensor->buffer].get(); - ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]); - const int32_t* bias_values = - reinterpret_cast(bias_buffer->data.data()); + auto control_size = tensor_type_ == TensorType_INT8 + ? 
sizeof(int32_t) * bias_tensor->shape[0] + : sizeof(int64_t) * bias_tensor->shape[0]; + + ASSERT_EQ(bias_buffer->data.size(), control_size); const auto original_bias_buffer = readonly_model_->buffers()->Get(bias_tensor->buffer); const float* bias_float_buffer = reinterpret_cast(original_bias_buffer->data()->data()); - for (size_t i = 0; i < out_channel_size; i++) { - auto dequantized_value = bias_values[i] * bias_scales[i]; - EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2); + if (tensor_type_ == TensorType_INT8) { + int32_t* bias_values = reinterpret_cast(bias_buffer->data.data()); + for (size_t i = 0; i < out_channel_size; i++) { + auto dequantized_value = bias_values[i] * bias_scales[i]; + EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2); + } + } else if (tensor_type_ == TensorType_INT16) { + int64_t* bias_values = reinterpret_cast(bias_buffer->data.data()); + for (size_t i = 0; i < out_channel_size; i++) { + auto dequantized_value = bias_values[i] * bias_scales[i]; + EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2); + } } const auto weights_buffer = model_.buffers[weights_tensor->buffer].get(); @@ -695,8 +768,9 @@ class QuantizeSoftmaxTest : public QuantizeModelTest { }; TEST_F(QuantizeSoftmaxTest, VerifySoftmaxQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -755,8 +829,9 @@ class QuantizeAvgPoolTest : public QuantizeModelTest { }; TEST_F(QuantizeAvgPoolTest, VerifyAvgPoolQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -816,8 +891,9 @@ class QuantizeMultiInputAddWithReshapeTest : public QuantizeModelTest { }; TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyReshapeQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Verify Reshape is quantized. @@ -863,8 +939,9 @@ TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyReshapeQuantization) { } TEST_F(QuantizeMultiInputAddWithReshapeTest, VerifyAddQuantization) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Verify ADD is quantized. @@ -923,8 +1000,9 @@ class QuantizeConstInputTest : public QuantizeModelTest { }; TEST_F(QuantizeConstInputTest, VerifyConstOpInput) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Verify ConstOp is quantized. 
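The expected constants in the conv and concat tests above can be re-derived by hand. A short pure-Python sketch of that arithmetic, with the min/max values copied from the test comments; it mirrors the formulas, not the TFLite implementation itself:

# Activation quantization parameters checked by the tests above.
# int8 activations: 256 levels over [min, max]; these models have min == 0, so zero_point == -128.
# int16 activations: symmetric over [-range, range], 32767 levels per side, zero_point == 0.
def int8_activation_params(min_val, max_val):
    scale = (max_val - min_val) / 255.0
    zero_point = -128 - int(round(min_val / scale))
    return scale, zero_point

def int16_activation_params(min_val, max_val):
    range_val = max(abs(min_val), abs(max_val))
    return range_val / 32767.0, 0

print(int8_activation_params(0.0, 5.0))    # ~(0.019607844, -128), "input0" in the concat test
print(int16_activation_params(0.0, 5.0))   # ~(0.00015259254, 0)
print(int8_activation_params(0.0, 10.0))   # ~(0.039215688, -128), "input1" and the concat output
print(int16_activation_params(0.0, 10.0))  # ~(0.00030518509, 0)

This also explains the bias checks in the conv test: the bias keeps a scale derived from the input and per-channel weight scales, and is stored as int32 when activations are int8 but as int64 when they are int16, which is why the expected buffer size switches between sizeof(int32_t) and sizeof(int64_t).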
@@ -965,8 +1043,9 @@ class QuantizeArgMaxTest : public QuantizeModelTest { }; TEST_F(QuantizeArgMaxTest, VerifyArgMax) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1008,8 +1087,9 @@ class QuantizeLSTMTest : public QuantizeModelTest { TEST_F(QuantizeLSTMTest, VerifyLSTM) { // Quantize model. - auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, - TensorType_FLOAT32, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Read expected model. @@ -1067,8 +1147,9 @@ class QuantizeLSTM2Test : public QuantizeModelTest { TEST_F(QuantizeLSTM2Test, VerifyLSTM) { // Quantize model. - auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, - TensorType_FLOAT32, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Read expected model. @@ -1126,8 +1207,9 @@ class QuantizeSVDFTest : public QuantizeModelTest { TEST_F(QuantizeSVDFTest, VerifySVDF) { // Quantize model. - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); // Read expected model. @@ -1184,8 +1266,9 @@ class QuantizeFCTest : public QuantizeModelTest { }; TEST_F(QuantizeFCTest, VerifyFC) { - auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, - TensorType_INT8, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1236,7 +1319,7 @@ class QuantizeCustomOpTest : public QuantizeModelTest { TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) { auto status = QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, - /*allow_float=*/true, &error_reporter_); + /*allow_float=*/true, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1270,7 +1353,8 @@ class QuantizePackTest : public QuantizeModelTest { }; TEST_F(QuantizePackTest, VerifyPack) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); @@ -1334,7 +1418,8 @@ class QuantizeMinimumMaximumTest }; TEST_P(QuantizeMinimumMaximumTest, VerifyMinimumMaximum) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1415,7 +1500,8 @@ class QuantizeUnpackTest : public QuantizeModelTest { } }; TEST_F(QuantizeUnpackTest, VerifyUnpack) { - auto status = QuantizeModel(&builder_, &model_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); From 
de6afc5d6b509c8f3d709bf1e275373864ec0936 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 12 Feb 2020 13:51:24 +0000 Subject: [PATCH 002/112] Changed per reviewer comments. --- .../python/optimize/calibration_wrapper.cc | 3 +- .../lite/tools/optimize/quantization_utils.cc | 10 ++----- .../lite/tools/optimize/quantize_model.cc | 19 +++++++++---- .../lite/tools/optimize/quantize_model.h | 25 +++++++++++++---- .../tools/optimize/quantize_model_test.cc | 28 +++++++++---------- 5 files changed, 51 insertions(+), 34 deletions(-) diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 88995136726..cdc8adaaf2b 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -266,7 +266,8 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, auto status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, {op_name}, - TensorType_INT8, error_reporter_.get()); + TensorType_INT8, + error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); return nullptr; diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index 4bc9686ec2c..ba43416cf04 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -108,14 +108,10 @@ TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, std::numeric_limits::min(), std::numeric_limits::max(), quantization_params); } else if (activations_type == TensorType_INT16) { - float range = std::max(std::abs(tensor->quantization->min[0]), - std::abs(tensor->quantization->max[0])); const float quantized_range = 32767.0; - const float scale = range / quantized_range; - quantization_params->min = std::vector(1, -range); - quantization_params->max = std::vector(1, range); - quantization_params->scale = std::vector(1, scale); - quantization_params->zero_point = std::vector(1, 0); + GetSymmetricQuantizationParams(tensor->quantization->min[0], + tensor->quantization->max[0], + quantized_range, quantization_params); } else { error_reporter->Report( "Unsupported activation type for quantize-activation: %s", diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index ee562fe9c4c..bbb40080fbc 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -1210,18 +1210,25 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* model, const TensorType& input_type, - const TensorType& output_type, - const TensorType& activations_type, + const TensorType& output_type, bool allow_float, ErrorReporter* error_reporter) { - return QuantizeModel(builder, model, input_type, output_type, - /*allow_float=*/false, activations_type, error_reporter); + return QuantizeModel(builder, model, input_type, output_type, allow_float, + GetAllOperatorOutputs(model), TensorType_INT8, + error_reporter); } TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, const TensorType& activations_type, + ModelT* model, const TensorType& input_type, + const TensorType& output_type, ErrorReporter* error_reporter) { + return QuantizeModel(builder, model, 
input_type, output_type, + /*allow_float=*/false, error_reporter); +} + +TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, ErrorReporter* error_reporter) { return QuantizeModel(builder, model, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/false, activations_type, error_reporter); + /*allow_float=*/false, error_reporter); } } // namespace optimize diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h index cc801ec9870..06c30b88fd0 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.h +++ b/tensorflow/lite/tools/optimize/quantize_model.h @@ -35,9 +35,7 @@ namespace optimize { // // Note: This is a private API, subject to change. TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* input_model, - const TensorType& activations_type, - ErrorReporter* error_reporter); + ModelT* input_model, ErrorReporter* error_reporter); // Same as above, but the types of quantized inputs and outputs are // configurable. @@ -46,7 +44,6 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, - const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but can enable allowing float intermediate operations for ops @@ -56,7 +53,6 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, - const TensorType& activations_type, ErrorReporter* error_reporter); // Same as above, but enables only quantizing a whitelist of operations, @@ -67,6 +63,25 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, ModelT* input_model, const TensorType& input_type, const TensorType& output_type, bool allow_float, const std::unordered_set& operator_names, + ErrorReporter* error_reporter); + +// Same as above, but enables to provide activation type, which +// could be TensorType_INT16 or TensorType_INT8. +// +// Note: This is a private API, subject to change. +TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, const TensorType& input_type, + const TensorType& output_type, bool allow_float, + const TensorType& activations_type, + ErrorReporter* error_reporter); + +// Quantizes input_model and populates the provided builder with the new model +// with all possible input parameters. +// All functions above call this function underneath. 
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, const TensorType& input_type, + const TensorType& output_type, bool allow_float, + const std::unordered_set& operator_names, const TensorType& activations_type, ErrorReporter* error_reporter); diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index 166d60ecc66..ef46b3fbd5d 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -106,9 +106,9 @@ TEST_P(QuantizeConvModelTest, QuantizationSucceeds) { } TEST_P(QuantizeConvModelTest, SkipUnspecifiedLayer) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float=*/true, {}, tensor_type_, &error_reporter_); + auto status = QuantizeModel( + &builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + /*allow_float=*/true, {}, TensorType_FLOAT32, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size()); // The resulting model should be the same. @@ -190,8 +190,9 @@ TEST_P(QuantizeConvModelTest, OperatorsAreUnchanged) { } TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { - auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - tensor_type_, &error_reporter_); + auto status = + QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (const auto& subgraph : model_.subgraphs) { for (const auto& tensor : subgraph->tensors) { @@ -210,7 +211,7 @@ TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { TEST_P(QuantizeConvModelTest, FloatInputAndOutput) { auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - tensor_type_, &error_reporter_); + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -384,7 +385,7 @@ class QuantizeConcatModelTest : public QuantizeModelTest, // input1 / TEST_P(QuantizeConcatModelTest, AddRequantBeforeConcat) { auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - tensor_type_, &error_reporter_); + false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. 
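The GraphIsFullyQuantized expectations above have a convenient Python-level counterpart once a model has actually been converted in 16x8 mode. A hedged sketch using the public interpreter API; `quantized_tflite` stands for the flatbuffer produced by a converter call such as the one sketched under patch 004 below:

import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_content=quantized_tflite)
interpreter.allocate_tensors()

seen = {np.dtype(d["dtype"]).name for d in interpreter.get_tensor_details()}
# For a strict 16x8 conversion this should mirror the C++ test: int8 (weights),
# int16 (activations), int64 (biases), plus float32 only at a float model
# interface or for ops left in float when fallback is allowed.
print(sorted(seen))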
@@ -549,7 +550,7 @@ class QuantizeConvModel1Test : public QuantizeModelTest { TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, + QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, false, TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const auto& subgraph = model_.subgraphs[0]; @@ -658,7 +659,7 @@ INSTANTIATE_TEST_SUITE_P(QuantizeConvModel2TestInst, QuantizeConvModel2Test, TEST_P(QuantizeConvModel2Test, VerifyConvQuantization) { auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - tensor_type_, &error_reporter_); + false, tensor_type_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto conv_op = subgraph->operators[0].get(); @@ -1353,8 +1354,7 @@ class QuantizePackTest : public QuantizeModelTest { }; TEST_F(QuantizePackTest, VerifyPack) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); @@ -1418,8 +1418,7 @@ class QuantizeMinimumMaximumTest }; TEST_P(QuantizeMinimumMaximumTest, VerifyMinimumMaximum) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; @@ -1500,8 +1499,7 @@ class QuantizeUnpackTest : public QuantizeModelTest { } }; TEST_F(QuantizeUnpackTest, VerifyUnpack) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, &error_reporter_); + auto status = QuantizeModel(&builder_, &model_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); From 792f553fd078a425d66c81567ca8f3588d44fcdc Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 5 Feb 2020 11:55:27 +0000 Subject: [PATCH 003/112] Added non-strict mode for 16x8 quantization --- tensorflow/lite/python/lite.py | 16 ++++-- tensorflow/lite/tools/optimize/BUILD | 1 + .../lite/tools/optimize/operator_property.cc | 16 ++++++ .../lite/tools/optimize/operator_property.h | 3 +- .../lite/tools/optimize/quantize_model.cc | 50 +++++++++++------- .../tools/optimize/quantize_model_test.cc | 50 +++++++++++++++--- tensorflow/lite/tools/optimize/test_util.cc | 1 + tensorflow/lite/tools/optimize/test_util.h | 5 ++ .../tools/optimize/testdata/mixed16x8.bin | Bin 0 -> 1184 bytes 9 files changed, 111 insertions(+), 31 deletions(-) create mode 100644 tensorflow/lite/tools/optimize/testdata/mixed16x8.bin diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index fc9c064faf0..1e0c89d3aa5 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -220,13 +220,16 @@ class TFLiteConverterBase(object): "type to be INT8.") def _is_int8_target_required(self): - return (set([OpsSet.TFLITE_BUILTINS_INT8]) == set( + return ((set([OpsSet.TFLITE_BUILTINS_INT8]) == set( self.target_spec.supported_ops) or - self._smallest_supported_type() == constants.INT8) + self._smallest_supported_type() == constants.INT8) and + not self._is_int16x8_target_required()) def _is_int16x8_target_required(self): - return (set([OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]) == - set(self.target_spec.supported_ops)) + return bool( + set(self.target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) def _smallest_supported_type(self): if 
self.target_spec.supported_types: @@ -262,6 +265,11 @@ class TFLiteConverterBase(object): def _calibrate_quantize_model(self, result, inference_input_type, inference_output_type, enable_mlir_quantizer): allow_float = not self._is_int8_target_required() and not self._is_int16x8_target_required() + if (self._is_int16x8_target_required()): + allow_float = bool( + set(self.target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) calibrate_quantize = _calibrator.Calibrator(result) activations_type = constants.INT16 if self._is_int16x8_target_required() else constants.INT8 return calibrate_quantize.calibrate_and_quantize( diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD index 27be0f829ba..ee5e845b96b 100644 --- a/tensorflow/lite/tools/optimize/BUILD +++ b/tensorflow/lite/tools/optimize/BUILD @@ -245,6 +245,7 @@ tf_cc_test( "//tensorflow/lite/tools/optimize:testdata/maximum.bin", "//tensorflow/lite/tools/optimize:testdata/minimum.bin", "//tensorflow/lite/tools/optimize:testdata/mixed.bin", + "//tensorflow/lite/tools/optimize:testdata/mixed16x8.bin", "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin", "//tensorflow/lite/tools/optimize:testdata/pack.bin", "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin", diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 1f2d8bb4a4d..c31ad9dbb1e 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -70,6 +70,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; // ArgMax has no quantizable output. property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_AVERAGE_POOL_2D: property.inputs = {{0, {}}}; @@ -85,6 +86,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_SPLIT: // We skip input 0 since it is the split dim which is not real valued. @@ -143,6 +145,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}, {1, {}}}; // Comparisons have no quantizable outputs. 
property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_EXPAND_DIMS: // We skip input 1 as it is not real valued (it's the index of axis) and @@ -165,11 +168,13 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_HARD_SWISH: { property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 1; + property.quantizable_int16 = false; break; } case BuiltinOperator_LOG_SOFTMAX: { @@ -180,6 +185,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, tensor_property.restricted_value_int8 = {16.0 / 256.0, 127}; property.outputs = {{0, tensor_property}}; property.version = 2; + property.quantizable_int16 = false; break; } case BuiltinOperator_LOGISTIC: { @@ -736,6 +742,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.restrict_scale = {{18, 0}}; property.version = 2; } + property.quantizable_int16 = false; break; } case BuiltinOperator_L2_NORMALIZATION: { @@ -746,6 +753,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, tensor_property.restricted_value_int8 = {1 / 128.0, 0}; property.outputs = {{0, tensor_property}}; property.version = 2; + property.quantizable_int16 = false; break; } case BuiltinOperator_MAX_POOL_2D: @@ -765,6 +773,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_MINIMUM: property.arbitrary_inputs = true; @@ -791,6 +800,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_QUANTIZE: property.inputs = {{0, {}}}; @@ -802,11 +812,13 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_RELU_N1_TO_1: property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 1; + property.quantizable_int16 = false; break; case BuiltinOperator_RESHAPE: property.inputs = {{0, {}}}; @@ -820,6 +832,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.restrict_same_input_output_scale = true; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_SHAPE: property.inputs = {{0, {}}}; @@ -866,6 +879,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantizable_int16 = false; break; case BuiltinOperator_TANH: { property.inputs = {{0, {}}}; @@ -899,6 +913,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, {3, tensor_property_bias}}; property.outputs = {{0, {}}}; property.version = 3; + property.quantizable_int16 = false; break; } case BuiltinOperator_TRANSPOSE: @@ -916,6 +931,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, default: // No quantized implementation exists for this operation. 
property.quantizable = false; + property.quantizable_int16 = false; } return property; } diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h index 23052308568..151e314f335 100644 --- a/tensorflow/lite/tools/optimize/operator_property.h +++ b/tensorflow/lite/tools/optimize/operator_property.h @@ -65,7 +65,8 @@ struct TensorProperty { struct OperatorProperty { // Is a quantized operations currently supported. bool quantizable = true; - + // Is a quantized operations currently supported for 16x8 + bool quantizable_int16 = true; // Op has arbitrary number of inputs, such as concat. bool arbitrary_inputs = false; // Op has arbitrary number of outputs, such as slice. diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index bbb40080fbc..ceae3c29d9e 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -43,13 +43,17 @@ namespace { // operator_names. operator_property::OperatorProperty GetOperatorProperty( const std::unordered_set& operator_names, const ModelT* model, - int subgraph_index, int op_idx, const string& operator_name) { + int subgraph_index, int op_idx, const string& operator_name, + const TensorType& activations_type) { operator_property::OperatorProperty property = operator_property::GetOperatorProperty(model, subgraph_index, op_idx); const OperatorT* op = model->subgraphs[subgraph_index]->operators[op_idx].get(); const BuiltinOperator op_code = model->operator_codes[op->opcode_index]->builtin_code; + if (activations_type == TensorType_INT16 && !property.quantizable_int16) { + property.quantizable = false; + } // The algorithm adds Dequantize and Quantize, so we don't require them to be // in the operator_names. if (op_code != BuiltinOperator_DEQUANTIZE && @@ -320,9 +324,9 @@ TfLiteStatus ApplyConstraints(ModelT* model, // Iterate backward to avoid messing with index. 
for (int op_idx = subgraph->operators.size() - 1; op_idx >= 0; op_idx--) { OperatorT* op = subgraph->operators[op_idx].get(); - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); if (!property.quantizable) { continue; } @@ -840,11 +844,17 @@ TfLiteStatus QuantizeWeightsInputOutput( OperatorT* op = subgraph->operators[op_idx].get(); const BuiltinOperator op_code = model->operator_codes[op->opcode_index]->builtin_code; - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); - if (!property.quantizable && !allow_float) { + if (activations_type == TensorType_INT16 && !property.quantizable && + !allow_float) { + error_reporter->Report( + "Quantization to 16x8-bit not yet supported for op: %s", + EnumNameBuiltinOperator(op_code)); + return kTfLiteError; + } else if (!property.quantizable && !allow_float) { error_reporter->Report("Quantization not yet supported for op: %s", EnumNameBuiltinOperator(op_code)); return kTfLiteError; @@ -882,9 +892,9 @@ TfLiteStatus QuantizeBiases(ModelT* model, OperatorT* op = subgraph->operators[op_idx].get(); const BuiltinOperator op_code = model->operator_codes[op->opcode_index]->builtin_code; - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); if (!property.quantizable) { continue; } @@ -951,15 +961,15 @@ std::unordered_set GetAllOperatorOutputs(ModelT* model) { // will not be filled by this function. TfLiteStatus FillQuantizationParams( ModelT* model, const std::unordered_set& operator_names, - ErrorReporter* error_reporter) { + const TensorType& activations_type, ErrorReporter* error_reporter) { for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size(); subgraph_idx++) { SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) { OperatorT* op = subgraph->operators[op_idx].get(); - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); // Populate max, min for each input tensor. 
for (const std::pair& input : @@ -1048,9 +1058,9 @@ TfLiteStatus EnsureBiasScaleCompatibility( SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get(); for (size_t op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) { OperatorT* op = subgraph->operators[op_idx].get(); - operator_property::OperatorProperty property = - GetOperatorProperty(operator_names, model, subgraph_idx, op_idx, - subgraph->tensors[op->outputs[0]]->name); + operator_property::OperatorProperty property = GetOperatorProperty( + operator_names, model, subgraph_idx, op_idx, + subgraph->tensors[op->outputs[0]]->name, activations_type); // Loop over all bias tensors. for (const int bias_idx : property.biases) { @@ -1174,8 +1184,8 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, const std::unordered_set& operator_names, const TensorType& activations_type, ErrorReporter* error_reporter) { - TF_LITE_ENSURE_STATUS( - FillQuantizationParams(model, operator_names, error_reporter)); + TF_LITE_ENSURE_STATUS(FillQuantizationParams( + model, operator_names, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS(EnsureBiasScaleCompatibility( model, operator_names, activations_type, error_reporter)); TF_LITE_ENSURE_STATUS( diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index ef46b3fbd5d..b73cb9a79ca 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -1308,7 +1308,8 @@ TEST_F(QuantizeFCTest, VerifyFC) { EXPECT_EQ(model_.operator_codes[1]->version, 1); } -class QuantizeCustomOpTest : public QuantizeModelTest { +class QuantizeCustomOpTest : public QuantizeModelTest, + public ::testing::WithParamInterface { protected: QuantizeCustomOpTest() { input_model_ = ReadModel(internal::kModelMixed); @@ -1317,10 +1318,10 @@ class QuantizeCustomOpTest : public QuantizeModelTest { } }; -TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) { +TEST_P(QuantizeCustomOpTest, VerifyMixedQuantization) { auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, - /*allow_float=*/true, TensorType_INT8, &error_reporter_); + QuantizeModel(&builder_, &model_, GetParam(), GetParam(), + /*allow_float=*/true, GetParam(), &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1334,8 +1335,45 @@ TEST_F(QuantizeCustomOpTest, VerifyMixedQuantization) { BuiltinOperator_CUSTOM, BuiltinOperator_CUSTOM, BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE}; const std::vector op_input_types = { - TensorType_INT8, TensorType_INT8, TensorType_FLOAT32, - TensorType_FLOAT32, TensorType_FLOAT32, TensorType_INT8}; + GetParam(), GetParam(), TensorType_FLOAT32, + TensorType_FLOAT32, TensorType_FLOAT32, GetParam()}; + for (int i = 0; i < subgraph->operators.size(); ++i) { + OperatorT* op = subgraph->operators[i].get(); + ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code, + op_codes[i]); + ASSERT_EQ(subgraph->tensors[op->inputs[0]]->type, op_input_types[i]); + } +} + +INSTANTIATE_TEST_SUITE_P(QuantizeCustomOpTest, QuantizeCustomOpTest, + ::testing::Values(TensorType_INT8, TensorType_INT16)); + +class QuantizeOp16x8Test : public QuantizeModelTest { + protected: + QuantizeOp16x8Test() { + input_model_ = ReadModel(internal::kModelMixed16x8); + readonly_model_ = input_model_->GetModel(); + readonly_model_->UnPackTo(&model_); + } +}; + +TEST_F(QuantizeOp16x8Test, 
VerifyMixedQuantization16x8) { + auto status = + QuantizeModel(&builder_, &model_, TensorType_INT16, TensorType_FLOAT32, + /*allow_float=*/true, TensorType_INT16, &error_reporter_); + ASSERT_EQ(kTfLiteOk, status); + const auto& subgraph = model_.subgraphs[0]; + auto float_graph = readonly_model_->subgraphs()->Get(0); + // The original model conv_2d->log_softmax + ASSERT_EQ(float_graph->operators()->size(), 2); + // The resulting model should be: + // conv_2d->dequantize->log_softmax + ASSERT_EQ(subgraph->operators.size(), 3); + const std::vector op_codes = { + BuiltinOperator_CONV_2D, BuiltinOperator_DEQUANTIZE, + BuiltinOperator_LOG_SOFTMAX}; + const std::vector op_input_types = { + TensorType_INT16, TensorType_INT16, TensorType_FLOAT32}; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code, diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc index 7d5e9d65f06..379be64059f 100644 --- a/tensorflow/lite/tools/optimize/test_util.cc +++ b/tensorflow/lite/tools/optimize/test_util.cc @@ -48,6 +48,7 @@ const char* kModelWithArgMaxOp = "argmax.bin"; const char* kModelWithFCOp = "fc.bin"; const char* kModelMixed = "mixed.bin"; +const char* kModelMixed16x8 = "mixed16x8.bin"; const char* kModelSplit = "split.bin"; diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h index abcdbc21d36..a49f3500288 100644 --- a/tensorflow/lite/tools/optimize/test_util.h +++ b/tensorflow/lite/tools/optimize/test_util.h @@ -76,6 +76,11 @@ extern const char* kModelWithFCOp; // reshape->custom->custom->squeeze. extern const char* kModelMixed; +// Test model with mixed quantizable and +// and un-quantizable ops for +// activations in 16-bit. +extern const char* kModelMixed16x8; + // Test model with split op. extern const char* kModelSplit; diff --git a/tensorflow/lite/tools/optimize/testdata/mixed16x8.bin b/tensorflow/lite/tools/optimize/testdata/mixed16x8.bin new file mode 100644 index 0000000000000000000000000000000000000000..c1f615e966eb164341d8ba4d56e1e359ef516388 GIT binary patch literal 1184 zcmb7E&ubG=5T2wl2_@9f2FxK8g;sg<_q<}Jbu!5~elfPXzj(rrE0@_xm+3R;4t8F&7%-*Ke z?MMvr&%hhtIgkZrfDAxiYp&~((nm2@EawU$tB@JMEnjCt_u<}P&Y0ZMYI|w5Y_-(Y z?}Kw!60=(O(}H?o7SzJdGcC3BTKlp4bnfJZ+^4yov`huYh4Sv}6>)b{(EDuQg76SInA$K1#dWu*K>jslH( zwSK(X?clzSY2>`xBq2|rjzL8e;CF#{zyRRcFt7rofz-$rjJJkeV4bPw`5-oBT%<(G zvtZ|9=p7fmc`I}VBn!|-a9(5_b}_Hl0$%}T--8^vr<{wF@*}`kFb#nBTJh!c ze_WR_X*1)~GGxq)vZ0^b)gcvcHmuG@-)WXcdio|9_W{lt{<*;4odf(8VSCxp9kbQ9 zFX=tUaxj6WQPpib-|iZw-7MzU(W*k84J*}}YnHz0`)5vJ%$p6$*g?FX2gIwu?lAH4 zOQvot>pe?fv(5U(=+2B6;4X4l1 Date: Tue, 17 Mar 2020 16:30:32 +0000 Subject: [PATCH 004/112] Corrected after merge with master. Tested: strict mode and non-strict mode. 
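The two modes referred to here map directly onto converter options. A minimal usage sketch; the SavedModel path and representative_data_gen are placeholders, and the op-set name is the one added earlier in this series:

import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_saved_model("/path/to/saved_model")
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen

# Strict mode: every op must have a 16x8 kernel, otherwise conversion fails.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
]
strict_model = converter.convert()

# Non-strict mode: ops without a 16x8 kernel are left in float.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8,
    tf.lite.OpsSet.TFLITE_BUILTINS,
]
fallback_model = converter.convert()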
Change-Id: I7e03d08133f39cc65a18875e65ce5cdddaf2d6a4 --- tensorflow/lite/python/lite.py | 62 ++++++++++++++++--- .../optimize/calibration_wrapper_pybind11.cc | 8 +-- 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 7e5f8ce704f..900398d7a6f 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -180,11 +180,13 @@ class QuantizationMode(object): def post_training_int8_no_float(self): """Post training int8 quantize, disallow float fallback.""" return (self._is_int8_target_required() and + not self._is_int16x8_target_required() and self._representative_dataset is not None) def post_training_int8_allow_float(self): """Post training int8 quantize, allow float fallback.""" return (self._any_optimization_enabled() and + not self._is_int16x8_target_required() and self._representative_dataset is not None and self._smallest_supported_type() == constants.INT8) @@ -193,6 +195,18 @@ class QuantizationMode(object): return (self._any_optimization_enabled() and self._contains_training_quant_op()) + def post_training_int16x8_no_float(self): + """Post training int16x8 quantize, disallow float fallback.""" + return (not self._is_int8_target_required() and + self._is_int16x8_target_required() and + not self._is_allow_float() and + self._representative_dataset is not None) + + def post_training_int16x8_allow_float(self): + """Post training int16x8 quantize, allow float fallback.""" + return (self._is_int16x8_target_required() and + self._is_allow_float()) + def post_training_dynamic_range_int8(self): """Post training int8 const, on-the-fly int8 quantize of dynamic tensors.""" # Post-training dynamic range quantization is only enabled if post-training @@ -212,9 +226,14 @@ class QuantizationMode(object): return not (self.post_training_int8_no_float() or self.post_training_int8_allow_float() or self.training_time_int8_allow_float() or + self.post_training_int16x8_no_float() or + self.post_training_int16x8_allow_float() or self.post_training_dynamic_range_int8() or self.post_training_fp16()) + def activations_type(self): + return constants.INT16 if self._is_int16x8_target_required() else constants.INT8 + # Below are helpers for the above functions. 
def _validate_int8_required(self): @@ -244,6 +263,18 @@ class QuantizationMode(object): self._target_spec.supported_ops) or set(self._target_spec.supported_types) == set([constants.INT8])) + def _is_int16x8_target_required(self): + return bool( + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) + + def _is_allow_float(self): + return bool( + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) + def _any_optimization_enabled(self): return bool( set(self._optimizations).intersection([ @@ -309,13 +340,13 @@ class TFLiteConverterBase(object): return _get_grappler_config(optimizers) def _calibrate_quantize_model(self, result, inference_input_type, - inference_output_type, allow_float): + inference_output_type, activations_type, allow_float): if not isinstance(self.representative_dataset, RepresentativeDataset): self.representative_dataset = RepresentativeDataset( self.representative_dataset) calibrate_quantize = _calibrator.Calibrator(result) - activations_type = constants.INT16 if self._is_int16x8_target_required() else constants.INT8 + if (self.experimental_calibrate_only: return calibrate_quantize.calibrate(self.representative_dataset.input_gen) else: @@ -608,12 +639,20 @@ class TFLiteConverterV2(TFLiteConverterBase): output_tensors=output_tensors, **converter_kwargs) + activations_type = quant_mode.activations_type() + if quant_mode.post_training_int8_no_float(): result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, False) + constants.FLOAT, activations_type, False) elif quant_mode.post_training_int8_allow_float(): result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, True) + constants.FLOAT, activations_type, True) + elif quant_mode.post_training_int16x8_no_float(): + result = self._calibrate_quantize_model(result, constants.FLOAT, + constants.FLOAT, activations_type, False) + elif quant_mode.post_training_int16x8_allow_float(): + result = self._calibrate_quantize_model(result, constants.FLOAT, + constants.FLOAT, activations_type, True) return result @@ -1114,6 +1153,8 @@ class TFLiteConverter(TFLiteConverterBase): quant_mode.post_training_int8_no_float() or quant_mode.post_training_int8_allow_float() or quant_mode.post_training_dynamic_range_int8() or + quant_mode.post_training_int16x8_no_float() or + quant_mode.post_training_int16x8_allow_float() or quant_mode.post_training_fp16()) if post_training_optimize: # Post training optimizations require that TOCO outputs a float model. 
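Taken together, the predicates above form a small decision table for the 16x8 path. A condensed, illustrative sketch of that logic (the real selection lives in QuantizationMode and also consults the int8 predicates):

INT16X8 = OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8

def int16x8_mode(supported_ops, has_representative_dataset):
    if INT16X8 not in supported_ops:
        return None  # int8 / dynamic-range / fp16 paths handle everything else
    if OpsSet.TFLITE_BUILTINS in supported_ops:
        return "post_training_int16x8_allow_float"
    if has_representative_dataset:
        return "post_training_int16x8_no_float"
    return None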
@@ -1223,12 +1264,20 @@ class TFLiteConverter(TFLiteConverterBase): output_arrays=self._output_arrays, **converter_kwargs) + activations_type = quant_mode.activations_type() + if quant_mode.post_training_int8_no_float(): result = self._calibrate_quantize_model(result, inference_input_type, - inference_output_type, False) + inference_output_type, activations_type, False) elif quant_mode.post_training_int8_allow_float(): result = self._calibrate_quantize_model(result, inference_input_type, - inference_output_type, True) + inference_output_type, activations_type, True) + elif quant_mode.post_training_int16x8_no_float(): + result = self._calibrate_quantize_model(result, inference_input_type, + inference_output_type, activations_type, False) + elif quant_mode.post_training_int16x8_allow_float(): + result = self._calibrate_quantize_model(result, inference_input_type, + inference_output_type, activations_type, True) return result @@ -1334,7 +1383,6 @@ class TocoConverter(object): @classmethod @_deprecation.deprecated( - None, "Use `lite.TFLiteConverter.from_keras_model_file` instead.") def from_keras_model_file(cls, model_file, input_arrays=None, diff --git a/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc b/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc index f56b23090b9..9a8fea5d1f6 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper_pybind11.cc @@ -40,17 +40,17 @@ PYBIND11_MODULE(_pywrap_tensorflow_lite_calibration_wrapper, m) { }) .def("QuantizeModel", [](CalibrationWrapper& self, int input_py_type, int output_py_type, - bool allow_float, bool enable_mlir_quantizer) { + bool allow_float, int activations_py_type, bool enable_mlir_quantizer) { return tensorflow::pyo_or_throw( self.QuantizeModel(input_py_type, output_py_type, allow_float, - enable_mlir_quantizer)); + activations_py_type, enable_mlir_quantizer)); }) .def("QuantizeModel", [](CalibrationWrapper& self, int input_py_type, int output_py_type, - bool allow_float) { + bool allow_float, int activations_py_type) { return tensorflow::pyo_or_throw( self.QuantizeModel(input_py_type, output_py_type, allow_float, - /*enable_mlir_quantizer=*/false)); + activations_py_type, /*enable_mlir_quantizer=*/false)); }) .def("QuantizeModel", [](CalibrationWrapper& self, int input_py_type, int output_py_type, bool allow_float, From 69ee4de053a14bdf883a0e6726bb2b374b71c973 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Fri, 27 Mar 2020 15:59:37 +0000 Subject: [PATCH 005/112] Fix for the broken 16-bit interface after latest changes to master. 
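The C++ entry point touched here is reached from Python through the internal calibrator. A hedged sketch of that call, using the argument order as it stands after the calibrator.py update later in this series; `float_tflite_model` and `representative_data_gen` are placeholders, and these modules are private APIs:

from tensorflow.lite.python import lite_constants as constants
from tensorflow.lite.python.optimize import calibrator as _calibrator

quantizer = _calibrator.Calibrator(float_tflite_model)
quantized_model = quantizer.calibrate_and_quantize(
    representative_data_gen,  # generator yielding lists of input arrays
    constants.FLOAT,          # inference_input_type
    constants.FLOAT,          # inference_output_type
    False,                    # allow_float: strict 16x8, no float fallback
    constants.INT16)          # activations_type introduced by this series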
--- .../python/optimize/calibration_wrapper.cc | 14 ++--- .../lite/tools/optimize/quantize_model.cc | 12 ++-- .../lite/tools/optimize/quantize_model.h | 12 ++-- .../tools/optimize/quantize_model_test.cc | 55 ++++++++++--------- 4 files changed, 49 insertions(+), 44 deletions(-) diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc index 660ea8d2d1b..ad82581bcba 100644 --- a/tensorflow/lite/python/optimize/calibration_wrapper.cc +++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc @@ -233,12 +233,11 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, reader_->AddCalibrationToModel(tflite_model.get(), /*update=*/false); flatbuffers::FlatBufferBuilder builder; auto status = kTfLiteOk; - - status = tflite::optimize::QuantizeModel( - &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), - TfLiteTypeToSchemaType(output_type), allow_float, - TfLiteTypeToSchemaType(activations_type), error_reporter_.get()); - } + + status = tflite::optimize::QuantizeModelAllOperators( + &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), + TfLiteTypeToSchemaType(output_type), allow_float, + TfLiteTypeToSchemaType(activations_type), error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); @@ -269,8 +268,7 @@ PyObject* CalibrationWrapper::QuantizeModel(int input_py_type, auto status = tflite::optimize::QuantizeModel( &builder, tflite_model.get(), TfLiteTypeToSchemaType(input_type), TfLiteTypeToSchemaType(output_type), allow_float, {op_name}, - TensorType_INT8, - error_reporter_.get()); + TensorType_INT8, error_reporter_.get()); if (status != kTfLiteOk) { error_reporter_->exception(); return nullptr; diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 32cd2b8c25a..0892e7ae52a 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -1240,11 +1240,13 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, return kTfLiteOk; } -TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, const TensorType& input_type, - const TensorType& output_type, bool allow_float, - const TensorType& activations_type, - ErrorReporter* error_reporter) { +TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, + const TensorType& input_type, + const TensorType& output_type, + bool allow_float, + const TensorType& activations_type, + ErrorReporter* error_reporter) { return QuantizeModel(builder, model, input_type, output_type, allow_float, GetAllOperatorOutputs(model), activations_type, error_reporter); diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h index 06c30b88fd0..29f581d2b35 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.h +++ b/tensorflow/lite/tools/optimize/quantize_model.h @@ -69,11 +69,13 @@ TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, // could be TensorType_INT16 or TensorType_INT8. // // Note: This is a private API, subject to change. 
-TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder, - ModelT* model, const TensorType& input_type, - const TensorType& output_type, bool allow_float, - const TensorType& activations_type, - ErrorReporter* error_reporter); +TfLiteStatus QuantizeModelAllOperators(flatbuffers::FlatBufferBuilder* builder, + ModelT* model, + const TensorType& input_type, + const TensorType& output_type, + bool allow_float, + const TensorType& activations_type, + ErrorReporter* error_reporter); // Quantizes input_model and populates the provided builder with the new model // with all possible input parameters. diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index 0f780e4d3da..885fa98992c 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -190,9 +190,9 @@ TEST_P(QuantizeConvModelTest, OperatorsAreUnchanged) { } TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { - auto status = - QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - /*allow_float*/ false, tensor_type_, &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, tensor_type_, tensor_type_, + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (const auto& subgraph : model_.subgraphs) { for (const auto& tensor : subgraph->tensors) { @@ -209,9 +209,9 @@ TEST_P(QuantizeConvModelTest, GraphIsFullyQuantized) { } TEST_P(QuantizeConvModelTest, FloatInputAndOutput) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, - /*allow_float*/ false, tensor_type_, &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, TensorType_FLOAT32, TensorType_FLOAT32, + /*allow_float*/ false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); for (int32_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); @@ -384,8 +384,9 @@ class QuantizeConcatModelTest : public QuantizeModelTest, // concat - output // input1 / TEST_P(QuantizeConcatModelTest, AddRequantBeforeConcat) { - auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - false, tensor_type_, &error_reporter_); + auto status = + QuantizeModelAllOperators(&builder_, &model_, tensor_type_, tensor_type_, + false, tensor_type_, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); // There is only one subgraph. 
@@ -549,9 +550,9 @@ class QuantizeConvModel1Test : public QuantizeModelTest { }; TEST_F(QuantizeConvModel1Test, VerifyConvQuantizationWithUnitScale) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT8, TensorType_INT8, false, - TensorType_INT8, &error_reporter_); + auto status = QuantizeModelAllOperators(&builder_, &model_, TensorType_INT8, + TensorType_INT8, false, + TensorType_INT8, &error_reporter_); EXPECT_EQ(status, kTfLiteOk); const auto& subgraph = model_.subgraphs[0]; @@ -658,8 +659,9 @@ INSTANTIATE_TEST_SUITE_P(QuantizeConvModel2TestInst, QuantizeConvModel2Test, TensorType_INT16})); TEST_P(QuantizeConvModel2Test, VerifyConvQuantization) { - auto status = QuantizeModel(&builder_, &model_, tensor_type_, tensor_type_, - false, tensor_type_, &error_reporter_); + auto status = + QuantizeModelAllOperators(&builder_, &model_, tensor_type_, tensor_type_, + false, tensor_type_, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto conv_op = subgraph->operators[0].get(); @@ -1308,8 +1310,9 @@ TEST_F(QuantizeFCTest, VerifyFC) { EXPECT_EQ(model_.operator_codes[1]->version, 1); } -class QuantizeCustomOpTest : public QuantizeModelTest, - public ::testing::WithParamInterface { +class QuantizeCustomOpTest + : public QuantizeModelTest, + public ::testing::WithParamInterface { protected: QuantizeCustomOpTest() { input_model_ = ReadModel(internal::kModelMixed); @@ -1319,9 +1322,9 @@ class QuantizeCustomOpTest : public QuantizeModelTest, }; TEST_P(QuantizeCustomOpTest, VerifyMixedQuantization) { - auto status = - QuantizeModel(&builder_, &model_, GetParam(), GetParam(), - /*allow_float=*/true, GetParam(), &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, GetParam(), GetParam(), + /*allow_float=*/true, GetParam(), &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1335,7 +1338,7 @@ TEST_P(QuantizeCustomOpTest, VerifyMixedQuantization) { BuiltinOperator_CUSTOM, BuiltinOperator_CUSTOM, BuiltinOperator_QUANTIZE, BuiltinOperator_SQUEEZE}; const std::vector op_input_types = { - GetParam(), GetParam(), TensorType_FLOAT32, + GetParam(), GetParam(), TensorType_FLOAT32, TensorType_FLOAT32, TensorType_FLOAT32, GetParam()}; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); @@ -1358,9 +1361,9 @@ class QuantizeOp16x8Test : public QuantizeModelTest { }; TEST_F(QuantizeOp16x8Test, VerifyMixedQuantization16x8) { - auto status = - QuantizeModel(&builder_, &model_, TensorType_INT16, TensorType_FLOAT32, - /*allow_float=*/true, TensorType_INT16, &error_reporter_); + auto status = QuantizeModelAllOperators( + &builder_, &model_, TensorType_INT16, TensorType_FLOAT32, + /*allow_float=*/true, TensorType_INT16, &error_reporter_); ASSERT_EQ(kTfLiteOk, status); const auto& subgraph = model_.subgraphs[0]; auto float_graph = readonly_model_->subgraphs()->Get(0); @@ -1369,11 +1372,11 @@ TEST_F(QuantizeOp16x8Test, VerifyMixedQuantization16x8) { // The resulting model should be: // conv_2d->dequantize->log_softmax ASSERT_EQ(subgraph->operators.size(), 3); - const std::vector op_codes = { - BuiltinOperator_CONV_2D, BuiltinOperator_DEQUANTIZE, - BuiltinOperator_LOG_SOFTMAX}; + const std::vector op_codes = {BuiltinOperator_CONV_2D, + BuiltinOperator_DEQUANTIZE, + BuiltinOperator_LOG_SOFTMAX}; const std::vector op_input_types = { - TensorType_INT16, TensorType_INT16, TensorType_FLOAT32}; + 
TensorType_INT16, TensorType_INT16, TensorType_FLOAT32}; for (int i = 0; i < subgraph->operators.size(); ++i) { OperatorT* op = subgraph->operators[i].get(); ASSERT_EQ(model_.operator_codes[op->opcode_index]->builtin_code, From 2759cdca671dee4b7a2035710cf08725a97ce73c Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 7 Apr 2020 16:25:36 +0100 Subject: [PATCH 006/112] Fix after merging with master. --- tensorflow/lite/python/optimize/calibrator.py | 4 ++-- tensorflow/lite/python/optimize/calibrator_test.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py index e31983b834e..90c43fcddfa 100644 --- a/tensorflow/lite/python/optimize/calibrator.py +++ b/tensorflow/lite/python/optimize/calibrator.py @@ -60,8 +60,8 @@ class Calibrator(object): input_type, output_type, allow_float, - resize_input=True, - activations_type = lite_constants.INT8): + activations_type=lite_constants.INT8, + resize_input=True): """Calibrates the model with specified generator and then quantizes it. The input shapes of the calibrator are resized with the calibration data if diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py index f8a1171a629..f778c8a555d 100644 --- a/tensorflow/lite/python/optimize/calibrator_test.py +++ b/tensorflow/lite/python/optimize/calibrator_test.py @@ -148,7 +148,9 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): with self.assertRaisesRegex(ValueError, 'Size mismatch'): quantizer.calibrate_and_quantize(input_gen, constants.FLOAT, - constants.FLOAT, False, False) + constants.FLOAT, False, + constants.INT8, + False) def test_invalid_type_calibrator_gen(self): model_path = resource_loader.get_path_to_datafile( From e5d5522d827e1c60b3ac830000b4489206480f95 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 4 May 2020 14:12:51 +0100 Subject: [PATCH 007/112] Both inputs should be in int16 for MUL operator. Some networks have one of inputs as a constant. --- tensorflow/lite/tools/optimize/operator_property.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 38c34706fbe..2ffe9fa3671 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -807,6 +807,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, case BuiltinOperator_MUL: property.inputs = {{0, {}}, {1, {}}}; property.outputs = {{0, {}}}; + property.quantize_input_as_activations = true; property.version = 2; break; case BuiltinOperator_PACK: From cbf60b5223997eceb6c4221ef4868fd6c792622c Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 12:38:44 +0100 Subject: [PATCH 008/112] Addressed reviewer's comments. 
Change-Id: I3b7842c42b8c905ed44e0cd556134210cb45479c --- tensorflow/lite/python/lite.py | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 92c0d5a95d9..010952820b9 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -258,7 +258,7 @@ class QuantizationMode(object): if self.training_time_int8_allow_float(): return { - "inference_type": inference_ty if inference_ty else constants.INT8, + "inference_type": inference_ty if inference_ty else self.activations_type(), "inference_input_type": inference_input_ty if inference_input_ty else constants.FLOAT, "post_training_quantize": False, # disable dynamic range quantization @@ -297,12 +297,28 @@ class QuantizationMode(object): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, + "activations_type": constants.INT8, "allow_float": False } elif self.post_training_int8_allow_float(): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, + "activations_type": constants.INT8, + "allow_float": True + } + elif self.post_training_int16x8_no_float(): + return True, { + "inference_input_type": inference_input_type, + "inference_output_type": inference_output_type, + "activations_type": constants.INT16, + "allow_float": False + } + elif self.post_training_int16x8_allow_float(): + return True, { + "inference_input_type": inference_input_type, + "inference_output_type": inference_output_type, + "activations_type": constants.INT16, "allow_float": True } else: @@ -573,25 +589,6 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): output_tensors=output_tensors, **converter_kwargs) - activations_type = quant_mode.activations_type() - - if quant_mode.post_training_int8_no_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - False) - elif quant_mode.post_training_int8_allow_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - True) - elif quant_mode.post_training_int16x8_no_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - False) - elif quant_mode.post_training_int16x8_allow_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, activations_type, - True) - calibrate_and_quantize, flags = quant_mode.quantizer_flags() if calibrate_and_quantize: result = self._calibrate_quantize_model(result, **flags) From 67ea57b15bb223e72a60265c24082ae5a31d0f0e Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 15:57:18 +0100 Subject: [PATCH 009/112] Small fix for inference. 
Change-Id: Ifd8670ccb9604ecced3d013f529ddbe16fcd75cf --- tensorflow/lite/python/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/python/util.py b/tensorflow/lite/python/util.py index 32a2d596629..b56b9f49b7a 100644 --- a/tensorflow/lite/python/util.py +++ b/tensorflow/lite/python/util.py @@ -49,6 +49,7 @@ _MAP_TF_TO_TFLITE_TYPES = { dtypes.string: _types_pb2.STRING, dtypes.uint8: _types_pb2.QUANTIZED_UINT8, dtypes.int8: _types_pb2.INT8, + dtypes.int16: _types_pb2.QUANTIZED_INT16, dtypes.complex64: _types_pb2.COMPLEX64, dtypes.bool: _types_pb2.BOOL, } From 29fdee8e85e750d04f6e9d378e85443ba5c7a239 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 17:50:28 +0100 Subject: [PATCH 010/112] Fix for error_reporter. Change-Id: I58745cc97872af74b1ad5b0af3ad778b39f01555 --- .../lite/tools/optimize/quantization_utils.cc | 3 ++- tensorflow/lite/tools/optimize/quantize_model.cc | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index abbcb642287..cdf2743585e 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -113,7 +113,8 @@ TfLiteStatus GetQuantizationParams(TensorT* tensor, TensorType activations_type, tensor->quantization->max[0], quantized_range, quantization_params); } else { - error_reporter->Report( + TF_LITE_REPORT_ERROR( + error_reporter, "Unsupported activation type for quantize-activation: %s", activations_type); return kTfLiteError; diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc index 6dd8ddd2d8c..0cf69eee3b4 100644 --- a/tensorflow/lite/tools/optimize/quantize_model.cc +++ b/tensorflow/lite/tools/optimize/quantize_model.cc @@ -370,9 +370,9 @@ TfLiteStatus ApplyConstraints(ModelT* model, std::unique_ptr additional_tensor; const string requant_tensor_name = input_tensor->name + "_requantized"; utils::MakeTensorWithQuantParam( - requant_tensor_name, input_tensor->shape, - input_tensor->shape_signature, activations_type, - output_scale, output_zp, &additional_tensor); + requant_tensor_name, input_tensor->shape, + input_tensor->shape_signature, activations_type, output_scale, + output_zp, &additional_tensor); const int32_t additional_tensor_idx = subgraph->tensors.size(); subgraph->tensors.push_back(std::move(additional_tensor)); @@ -869,13 +869,15 @@ TfLiteStatus QuantizeWeightsInputOutput( if (activations_type == TensorType_INT16 && !property.quantizable && !allow_float) { - error_reporter->Report( - "Quantization to 16x8-bit not yet supported for op: %s", + TF_LITE_REPORT_ERROR( + error_reporter, + "Quantization to 16x8-bit not yet supported for op: %", EnumNameBuiltinOperator(op_code)); return kTfLiteError; } else if (!property.quantizable && !allow_float) { - error_reporter->Report("Quantization not yet supported for op: %s", - EnumNameBuiltinOperator(op_code)); + TF_LITE_REPORT_ERROR(error_reporter, + "Quantization not yet supported for op: %", + EnumNameBuiltinOperator(op_code)); return kTfLiteError; } From 761d850ac6456aed93ab250ff49af3f0a6a62960 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 3 Jun 2020 17:56:39 +0100 Subject: [PATCH 011/112] Renamed option with the prefix EXPERIMENTAL_. 
Change-Id: Idb84736507d5c07ebdf182b8a15d55906d0d7fc0 --- tensorflow/lite/python/convert.py | 2 +- tensorflow/lite/python/lite.py | 2 +- tensorflow/lite/python/lite_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index c30987a5898..939de61c608 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -98,7 +98,7 @@ class OpsSet(enum.Enum): # and int16 activations. # Specifying this will throw an error for operations that do not yet have # quantized implementations. - TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" def __str__(self): return self.value diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 010952820b9..781007241b4 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -356,7 +356,7 @@ class QuantizationMode(object): def _is_int16x8_target_required(self): return bool( set(self._target_spec.supported_ops).intersection([ - OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 ])) def _is_allow_float(self): diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 4075a887943..1d052b88c10 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -885,7 +885,7 @@ class FromSessionTest(TestModels, parameterized.TestCase): # Quantize model to Int8: with disable mlir ('UseTfliteBuiltinsIntDisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], False), # Quantize model to Int16: with disable mlir - ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) + ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() From ab6b2ffde37bf4443c7dadc39312f0429f417db6 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 15 May 2020 11:40:22 -0500 Subject: [PATCH 012/112] Removed TENSOR_OP disable env vars. 
* TF_DISABLE_CUBLAS_TENSOR_OP_MATH * TF_DISABLE_CUDNN_TENSOR_OP_MATH * TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH --- tensorflow/stream_executor/cuda/cuda_blas.cc | 21 ++------ tensorflow/stream_executor/cuda/cuda_dnn.cc | 55 +++++--------------- 2 files changed, 16 insertions(+), 60 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index c9f0fc462c9..65c07e72154 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -101,18 +101,6 @@ static std::string ToString(cublasStatus_t status) { } } -// Decide whether to enable TENSOR_OP_MATH -static bool TensorOpMathEnabled() { - static bool is_enabled = [] { - bool is_disabled; - TF_CHECK_OK( - tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH", - /*default_val=*/false, &is_disabled)); - return !is_disabled; - }(); - return is_enabled; -} - // cuBLAS has interfaces that permit pointers to be passed from either the host // memory space or the device memory space; however, you must instruct it as to // which address space those pointers are in with cublasSetPointerMode. @@ -1640,7 +1628,7 @@ bool CUDABlas::DoBlasGemm( &cc_minor); // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7 && TensorOpMathEnabled()) { + if (cc_major >= 7) { use_tensor_ops = true; } #endif @@ -1921,8 +1909,7 @@ static bool TensorOpsAvailable(int cc_major) { // strictly correct. We can't simply enable it, though, as that would change // clients' behavior significantly: Using tensor ops on fp32 inputs cause them // to be rounded to fp16. - if (cc_major >= 7 && TensorOpMathEnabled() && - std::is_same::value) { + if (cc_major >= 7 && std::is_same::value) { return true; } #endif @@ -2270,7 +2257,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor) && cc_major >= 5) { - bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F; + bool use_tensor_ops = data_type == CUDA_R_16F; cublasGemmAlgo_t algo = (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); cudaDataType_t compute_type = @@ -2425,7 +2412,7 @@ bool CUDABlas::DoBlasGemmStridedBatched( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor)) { // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7 && TensorOpMathEnabled()) { + if (cc_major >= 7) { use_tensor_ops = true; } #if CUDA_VERSION >= 9010 diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 6122877f91f..780f1475c2c 100755 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -603,31 +603,6 @@ class CudnnFilterDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor); }; -// A helper function to decide whether to enable the TENSOR_OP_MATH math type -bool TensorOpMathEnabled() { - static bool is_enabled = [] { - bool is_disabled = false; - TF_CHECK_OK( - tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH", - /*default_val=*/false, &is_disabled)); - return !is_disabled; - }(); - return is_enabled; -} - -// A helper function to decide whether to enable the TENSOR_OP_MATH math type -// for RNNs. 
-bool RnnTensorOpMathEnabled() { - static bool is_enabled = [] { - bool is_disabled = false; - TF_CHECK_OK( - tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH", - /*default_val=*/false, &is_disabled)); - return !is_disabled; - }(); - return is_enabled; -} - // A helper function to decide whether to use // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT @@ -751,9 +726,7 @@ class CudnnConvolutionDescriptor { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); - if (TensorOpMathEnabled()) { - CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); - } + CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); #endif } @@ -1157,21 +1130,19 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // in profile mode, which is run with algorithms returned from // GetRnnAlgorithms() (which are non-default and explicitly set whether to // use tensor ops). CuDNN 7.2.1 fixed this issue - if (RnnTensorOpMathEnabled()) { - cudnnMathType_t math_type; - if (algorithm_config.algorithm().has_value()) { - math_type = algorithm_config.algorithm()->tensor_ops_enabled() - ? CUDNN_TENSOR_OP_MATH - : CUDNN_DEFAULT_MATH; - } else { + cudnnMathType_t math_type; + if (algorithm_config.algorithm().has_value()) { + math_type = algorithm_config.algorithm()->tensor_ops_enabled() + ? CUDNN_TENSOR_OP_MATH + : CUDNN_DEFAULT_MATH; + } else { #if CUDNN_VERSION >= 7201 - math_type = CUDNN_TENSOR_OP_MATH; + math_type = CUDNN_TENSOR_OP_MATH; #else - math_type = CUDNN_DEFAULT_MATH; + math_type = CUDNN_DEFAULT_MATH; #endif // CUDNN_VERSION >= 7201 - } - CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); } + CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); #endif // CUDNN_VERSION >= 7000 return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan), @@ -2605,7 +2576,7 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( } static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled(); + return cc_major >= 7 && CUDNN_VERSION >= 7000; } port::StatusOr GetCudnnConvolutionForwardAlgorithm( @@ -3399,9 +3370,7 @@ bool CudnnSupport::GetRnnAlgorithms( for (auto i : algo_types) { out_algorithms->push_back({i, /*use_tensor_ops=*/false}); #if CUDNN_VERSION >= 7100 - if (RnnTensorOpMathEnabled()) { - out_algorithms->push_back({i, /*use_tensor_ops=*/true}); - } + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); #endif } return true; From 3cad62e356b8c72d03af13ba29f7ace29a6f0772 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 15 May 2020 11:46:41 -0500 Subject: [PATCH 013/112] Add global setting control TF32 execution --- tensorflow/core/platform/BUILD | 7 +++++++ tensorflow/core/platform/tf32_utils.cc | 27 ++++++++++++++++++++++++++ tensorflow/core/platform/tf32_utils.h | 27 ++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 tensorflow/core/platform/tf32_utils.cc create mode 100644 tensorflow/core/platform/tf32_utils.h diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index c7ff378d2ac..f27d2f09208 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -937,6 +937,13 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "tf32_utils", + srcs = ["tf32_utils.cc"], + hdrs = ["tf32_utils.h"], + copts = tf_copts(), +) + 
tf_cc_tests( name = "low_level_library_tests", size = "small", diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc new file mode 100644 index 00000000000..715b5996dc3 --- /dev/null +++ b/tensorflow/core/platform/tf32_utils.cc @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/tf32_utils.h" + +namespace tensorflow { + +// TODO(nluehr): enable tf32 execution by default after TF32 Ampere testing. +static bool tf32_enabled = false; + +void allow_tf32_execution(bool allow) { tf32_enabled = allow; } + +bool tf32_execution_allowed() { return tf32_enabled; } + +} // namespace tensorflow diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h new file mode 100644 index 00000000000..a0ce58f9bbd --- /dev/null +++ b/tensorflow/core/platform/tf32_utils.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ +#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ + +namespace tensorflow { + +void allow_tf32_execution(bool allow); + +bool tf32_execution_allowed(); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ From 8bfee17f5880eccdb759fb47ab11b782f201cf0f Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 15 May 2020 13:33:02 -0500 Subject: [PATCH 014/112] Python tf.config tf32 interface --- tensorflow/python/BUILD | 11 +++++++++++ tensorflow/python/framework/config.py | 26 ++++++++++++++++++++++++++ tensorflow/python/util/tf32.cc | 22 ++++++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 tensorflow/python/util/tf32.cc diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index a49e4b74def..997ec6c924f 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -746,6 +746,16 @@ tf_python_pybind_extension( ], ) +tf_python_pybind_extension( + name = "_pywrap_tf32_execution", + srcs = ["util/tf32.cc"], + module_name = "_pywrap_tf32_execution", + deps = [ + "//tensorflow/core/platform:tf32_utils", + "@pybind11", + ], +) + tf_python_pybind_extension( name = "_pywrap_util_port", srcs = ["util/port_wrapper.cc"], @@ -5573,6 +5583,7 @@ py_library( "//tensorflow:composite_tensor_whitelist", ], deps = [ + ":_pywrap_tf32_execution", ":tf_decorator", ":tf_export", ":tf_stack", diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 5361d7290e8..042af4d1023 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -18,10 +18,36 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python import _pywrap_tf32_execution from tensorflow.python.eager import context from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export +def tensor_float32_execution_allowed(): + """Get if TensorFloat-32 operations are enabled on supported hardware. + + Returns: + True if TensorFloat-32 execution is enabled and False otherwise. + """ + return _pywrap_tf32_execution.is_allowed() + +def allow_tensor_float_32_execution(allow): + """Allow use of TensorFloat-32 with float32 ops on supported hardware. + + TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. + TensorFloat-32 kernels take float32 inputs and produce float32 outputs. + Internally, the inputs are cast to a custom representation with 10-bit + mantissa (similar to float16) and 8-bit exponent (similar to float32) and are + executed using TensorCores with float32 accumulation. For more information, + see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/. + + TensorFloat-32 execution is disabled by default, but this may change in a + future version. + + Args: + allow: whether to allow TensorFloat-32 execution + """ + _pywrap_tf32_execution.allow(allow) @tf_export('config.threading.get_intra_op_parallelism_threads') def get_intra_op_parallelism_threads(): diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/python/util/tf32.cc new file mode 100644 index 00000000000..7dece6ccdae --- /dev/null +++ b/tensorflow/python/util/tf32.cc @@ -0,0 +1,22 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "tensorflow/core/platform/tf32_utils.h" + +PYBIND11_MODULE(_pywrap_tf32_execution, m) { + m.def("allow", &tensorflow::allow_tf32_execution); + m.def("is_allowed", &tensorflow::tf32_execution_allowed); +} From dedb51aec2a766bdeb8b4c2ab1700bfcf7687966 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 19 May 2020 14:58:30 -0500 Subject: [PATCH 015/112] Convolution TF32 Plumbing --- tensorflow/stream_executor/cuda/BUILD | 1 + tensorflow/stream_executor/cuda/cuda_dnn.cc | 200 +++++++++++++------- 2 files changed, 135 insertions(+), 66 deletions(-) mode change 100755 => 100644 tensorflow/stream_executor/cuda/cuda_dnn.cc diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 1457a36beaf..2749281335e 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -353,6 +353,7 @@ cc_library( "@local_config_cuda//cuda:cudnn_header", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor:dnn", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:plugin_registry", diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc old mode 100755 new mode 100644 index 780f1475c2c..53296f4eea5 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -20,8 +20,8 @@ limitations under the License. #include #include "absl/strings/str_cat.h" -#include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" @@ -42,6 +42,7 @@ limitations under the License. #include "tensorflow/stream_executor/scratch_allocator.h" #include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor_pimpl.h" +#include "third_party/eigen3/Eigen/Core" // clang-format off #include "third_party/gpus/cudnn/cudnn.h" #include "absl/strings/string_view.h" @@ -707,10 +708,6 @@ class CudnnConvolutionDescriptor { : CUDNN_CROSS_CORRELATION, data_type)); - // NOTE(benbarsdell): This only applies if tensor op math is enabled - // and algo selection is set to Default. - this->set_use_tensor_op_math(true); - #if CUDNN_MAJOR >= 7 VLOG(2) << "Requesting grouped convolution: " << convolution_descriptor.group_count(); @@ -722,10 +719,14 @@ class CudnnConvolutionDescriptor { #endif } - void set_use_tensor_op_math(bool use_tensor_op_math) const { + void set_use_tensor_op_math(bool use_tensor_op_math) { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = +#if CUDNN_VERSION >= 8000 + (use_tensor_op_math ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH); +#else (use_tensor_op_math ? 
CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); +#endif CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); #endif } @@ -738,6 +739,38 @@ class CudnnConvolutionDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor); }; +// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math +// set +static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { + cudnnMathType_t math_type; + CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); +#if CUDNN_VERSION >= 8000 + return math_type != CUDNN_FMA_MATH; +#else + return math_type == CUDNN_TENSOR_OP_MATH; +#endif +} + +static bool TensorOpMathAvailable(int cc_major) { + return cc_major >= 7 && CUDNN_VERSION >= 7000; +} + +static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) { + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + if (!TensorOpMathAvailable(cc_major)) { + return false; + } + if (input_type == dnn::DataType::kFloat) { + if (CUDNN_VERSION < 8000) { + return false; + } else if (!tensorflow::tf32_execution_allowed()) { + return false; + } + } + return true; +} + // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle // within a scope. class CudnnPoolingDescriptor { @@ -2450,10 +2483,11 @@ port::StatusOr> AllocateCudnnConvolutionForwardWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - // TODO(csigg): This has side effects on the convolution descriptor. It is - // functionally correct because the convolution is run with the algorithm of - // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { + return port::Status( + port::error::INTERNAL, + "Mismatch between cudnn conv and algorithm descriptors."); + } // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2493,10 +2527,11 @@ AllocateCudnnConvolutionBackwardDataWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - // TODO(csigg): This has side effects on the convolution descriptor. It is - // functionally correct because the convolution is run with the algorithm of - // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { + return port::Status( + port::error::INTERNAL, + "Mismatch between cudnn conv and algorithm descriptors."); + } // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2538,10 +2573,11 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - // TODO(csigg): This has side effects on the convolution descriptor. It is - // functionally correct because the convolution is run with the algorithm of - // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { + return port::Status( + port::error::INTERNAL, + "Mismatch between cudnn conv and algorithm descriptors."); + } // Query the size of the workspace and allocate it. 
size_t size_in_bytes; @@ -2575,18 +2611,39 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( return scratch_allocator->AllocateBytes(size_in_bytes); } -static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000; +port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type, + absl::optional desc) { + bool use_tensor_ops; + if (desc.has_value()) { + use_tensor_ops = desc->tensor_ops_enabled(); + if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) { + return port::Status(port::error::INVALID_ARGUMENT, + "Algo requests disallowed tensor op evaluation."); + } + } else { + use_tensor_ops = IsTensorMathAllowed(stream, type); + } + return use_tensor_ops; } port::StatusOr GetCudnnConvolutionForwardAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - const CudnnConvolutionDescriptor& conv, + dnn::DataType element_type, + const dnn::ConvolutionDescriptor& convolution_descriptor, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); + + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); + bool use_tensor_ops; + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); + if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. @@ -2599,10 +2656,7 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( GetCudnnConvolutionForwardAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - algo_desc = dnn::AlgorithmDesc( - algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); + algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); } const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace( @@ -2626,6 +2680,9 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( "Returned status: ", scratch_or.status().ToString())); } + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2636,10 +2693,19 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - const CudnnConvolutionDescriptor& conv, + dnn::DataType element_type, + const dnn::ConvolutionDescriptor& convolution_descriptor, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); + bool use_tensor_ops; + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); + if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. 
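Read together, TensorOpMathAvailable, IsTensorMathAllowed and UseTensorOps above encode a single policy: tensor-op math requires sm_70+ and cuDNN 7, float32 additionally requires cuDNN 8 plus the global TF32 opt-in, and an explicitly chosen algorithm that demands tensor ops while they are disallowed is rejected. The Python pseudocode below is only a restatement of that policy for readability; the names and types are illustrative, not an API.

    def use_tensor_ops(cc_major, cudnn_version, dtype, requested_algo, tf32_allowed):
        """Paraphrase of the IsTensorMathAllowed/UseTensorOps checks in the hunks above."""
        allowed = cc_major >= 7 and cudnn_version >= 7000
        if dtype == "float32":
            # float32 only reaches tensor cores via TF32: cuDNN 8 plus the opt-in.
            allowed = allowed and cudnn_version >= 8000 and tf32_allowed
        if requested_algo is not None:
            if requested_algo.tensor_ops_enabled and not allowed:
                raise ValueError("Algo requests disallowed tensor op evaluation.")
            return requested_algo.tensor_ops_enabled
        return allowed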
@@ -2652,10 +2718,7 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( GetCudnnConvolutionBackwardDataAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - algo_desc = dnn::AlgorithmDesc( - algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); + algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); } const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace( @@ -2678,6 +2741,9 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( "while a secondary algorithm is not provided."); } + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2688,10 +2754,19 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - const CudnnConvolutionDescriptor& conv, + dnn::DataType element_type, + const dnn::ConvolutionDescriptor& convolution_descriptor, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); + bool use_tensor_ops; + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); + if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. 
@@ -2704,10 +2779,7 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( GetCudnnConvolutionBackwardFilterAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - algo_desc = dnn::AlgorithmDesc( - algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); + algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); } auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace( @@ -2730,6 +2802,9 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( "while a secondary algorithm is not provided."); } + SE_ASSIGN_OR_RETURN(use_tensor_ops, + UseTensorOps(stream, element_type, algo_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2894,35 +2969,32 @@ port::Status CudnnSupport::DoPrepareForConvolution( CudnnTensorDescriptor output_nd( output_descriptor, ToCudnnDataType(element_type, output_descriptor.layout())); - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); auto cudnn = cudnn_->GetHandle(parent_, stream); switch (kind) { case dnn::ConvolutionKind::FORWARD: { - SE_ASSIGN_OR_RETURN( - *algorithm_desc, - GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter_nd, conv, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionForwardAlgorithm( + stream, cudnn, algorithm_config, input_nd, + filter_nd, element_type, convolution_descriptor, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_DATA: { - SE_ASSIGN_OR_RETURN( - *algorithm_desc, - GetCudnnConvolutionBackwardDataAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter_nd, conv, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionBackwardDataAlgorithm( + stream, cudnn, algorithm_config, input_nd, + filter_nd, element_type, convolution_descriptor, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_FILTER: { - SE_ASSIGN_OR_RETURN( - *algorithm_desc, - GetCudnnConvolutionBackwardFilterAlgorithm( - stream, cudnn, algorithm_config, input_nd, filter_nd, conv, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN(*algorithm_desc, + GetCudnnConvolutionBackwardFilterAlgorithm( + stream, cudnn, algorithm_config, input_nd, + filter_nd, element_type, convolution_descriptor, + output_nd, scratch_allocator, scratch_memory)); break; } default: @@ -2951,8 +3023,9 @@ port::Status CudnnSupport::DoConvolve( auto accumulator_type = GetConvAccumulatorType(element_type); CudnnConvolutionDescriptor conv(convolution_descriptor, ToCudnnDataType(accumulator_type)); - // Set use_tensor_math param to correct value - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + SE_ASSIGN_OR_RETURN(bool use_tensor_ops, + UseTensorOps(stream, element_type, algorithm_desc)); + conv.set_use_tensor_op_math(use_tensor_ops); auto cudnn = cudnn_->GetHandle(parent_, stream); // Alpha is the scaling factor for input. 
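Every tf32_execution_allowed() check threaded through these cuDNN paths (and the cuBLAS paths later in the series) is controlled from Python by the config functions added in this series. They are intentionally not exported under tf.config yet, so a usage sketch (assuming a CUDA 11 / cuDNN 8 build on hardware that supports TF32) goes through the framework module directly:

    from tensorflow.python.framework import config

    # TF32 is off by default in this series; opt in explicitly.
    config.allow_tensor_float_32_execution(True)
    assert config.tensor_float32_execution_allowed()

    # ... run float32 convolutions/matmuls, which may now use TF32 tensor cores ...

    config.allow_tensor_float_32_execution(False)  # back to strict float32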
@@ -3185,14 +3258,6 @@ port::Status CudnnSupport::DoConvolve( return port::Status::OK(); } -// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math -// set -static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { - cudnnMathType_t math_type; - CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); - return math_type == CUDNN_TENSOR_OP_MATH; -} - template port::Status CudnnSupport::DoFusedConvolveImpl( @@ -3226,8 +3291,6 @@ port::Status CudnnSupport::DoFusedConvolveImpl( filter_descriptor, GetCudnnDataType(conv_input_descriptor.layout())); CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType()); - CudnnConvolutionDescriptor conv(convolution_descriptor, - ToCudnnDataType(accumulator_type)); auto cudnn = cudnn_->GetHandle(parent_, stream); @@ -3237,9 +3300,14 @@ port::Status CudnnSupport::DoFusedConvolveImpl( SE_ASSIGN_OR_RETURN( dnn::AlgorithmDesc algo_desc, GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, conv_input_nd, filter, conv, + stream, cudnn, algorithm_config, conv_input_nd, filter, + dnn::ToDataType::value, convolution_descriptor, output_nd, scratch_allocator, &scratch)); + CudnnConvolutionDescriptor conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); + conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled()); + std::unique_ptr timer; if (is_profiling) { timer.reset(new GpuTimer(parent_)); // NOLINT From 0f58bb63090222cef0eebe74630b2d4d9d886a2f Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 19 May 2020 15:54:10 -0500 Subject: [PATCH 016/112] Plumb TF32 for RNN --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 30 ++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 53296f4eea5..fa06d410323 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1163,17 +1163,26 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // in profile mode, which is run with algorithms returned from // GetRnnAlgorithms() (which are non-default and explicitly set whether to // use tensor ops). CuDNN 7.2.1 fixed this issue - cudnnMathType_t math_type; + bool allow_tensor_ops = + data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed(); + bool use_tensor_ops; if (algorithm_config.algorithm().has_value()) { - math_type = algorithm_config.algorithm()->tensor_ops_enabled() - ? CUDNN_TENSOR_OP_MATH - : CUDNN_DEFAULT_MATH; + use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled(); } else { -#if CUDNN_VERSION >= 7201 - math_type = CUDNN_TENSOR_OP_MATH; -#else - math_type = CUDNN_DEFAULT_MATH; -#endif // CUDNN_VERSION >= 7201 + use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops; + } + + if (use_tensor_ops && !allow_tensor_ops) { + return port::Status(port::error::INVALID_ARGUMENT, + "Algo requests disallowed tensor op evaluation."); + } + + cudnnMathType_t math_type; + if (use_tensor_ops) { + math_type = + CUDNN_VERSION >= 8000 ? CUDNN_DEFAULT_MATH : CUDNN_TENSOR_OP_MATH; + } else { + math_type = CUDNN_VERSION >= 8000 ? 
CUDNN_FMA_MATH : CUDNN_DEFAULT_MATH; } CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); #endif // CUDNN_VERSION >= 7000 @@ -2626,6 +2635,9 @@ port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type, return use_tensor_ops; } +cudnnDataType_t GetRnnComputeType(dnn::DataType data_type); +dnn::DataType GetConvAccumulatorType(dnn::DataType data_type); + port::StatusOr GetCudnnConvolutionForwardAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, From b67608e66c54224fa52200095fba09df0f2b3c71 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Wed, 20 May 2020 10:06:35 -0500 Subject: [PATCH 017/112] Plumb TF32 for cublas gemm --- tensorflow/stream_executor/cuda/BUILD | 1 + tensorflow/stream_executor/cuda/cuda_blas.cc | 84 +++++++++----------- tensorflow/stream_executor/cuda/cuda_blas.h | 8 +- 3 files changed, 43 insertions(+), 50 deletions(-) diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 2749281335e..519033a62d8 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -251,6 +251,7 @@ cc_library( "@local_config_cuda//cuda:cuda_headers", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:host_or_device_scalar", diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 65c07e72154..e2cbb0b75df 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -48,7 +48,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" -#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" @@ -66,6 +66,7 @@ limitations under the License. #include "tensorflow/stream_executor/plugin_registry.h" #include "tensorflow/stream_executor/scratch_allocator.h" #include "tensorflow/stream_executor/stream_executor.h" +#include "third_party/eigen3/Eigen/Core" namespace stream_executor { namespace gpu { @@ -225,6 +226,18 @@ bool CUDABlas::Init() { return false; } +#if CUDA_VERSION >= 9000 +#if CUBLAS_VER_MAJOR >= 11 + ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH); +#else + ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH); +#endif + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret); + return false; + } +#endif + return true; } @@ -387,7 +400,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) { template bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - bool use_tensor_op_math, Args... args) { + Args... 
args) { absl::MutexLock lock(&mu_); CHECK(blas_ != nullptr); @@ -401,10 +414,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, : CUBLAS_POINTER_MODE_DEVICE)) { return false; } -#if CUDA_VERSION >= 9000 +#if CUBLAS_VER_MAJOR >= 11 ScopedCublasMathMode math_mode{blas_}; - if (use_tensor_op_math) { - if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) { + if (!tensorflow::tf32_execution_allowed()) { + if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) { return false; } } @@ -1621,21 +1634,9 @@ bool CUDABlas::DoBlasGemm( } } - bool use_tensor_ops = false; -#if CUDA_VERSION >= 9000 - int cc_major, cc_minor; - stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major, - &cc_minor); - - // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7) { - use_tensor_ops = true; - } -#endif - return DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), + true /* = err_on_failure= */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta, GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc); @@ -2257,7 +2258,8 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor) && cc_major >= 5) { - bool use_tensor_ops = data_type == CUDA_R_16F; + bool use_tensor_ops = + data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed(); cublasGemmAlgo_t algo = (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); cudaDataType_t compute_type = @@ -2271,7 +2273,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( bool ok; ok = DoBlasInternalImpl( AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */, - true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa), + true /* = err_on_failure */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda, b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc, batch_count, compute_type, algo); @@ -2406,33 +2408,25 @@ bool CUDABlas::DoBlasGemmStridedBatched( int lda, int64 stride_a, const DeviceMemory &b, int ldb, int64 stride_b, float beta, DeviceMemory *c, int ldc, int64 stride_c, int batch_count) { - bool use_tensor_ops = false; -#if CUDA_VERSION >= 9000 +#if CUDA_VERSION >= 9010 int cc_major, cc_minor; if (stream->parent()->GetDeviceDescription().cuda_compute_capability( - &cc_major, &cc_minor)) { - // GPUs < sm_70 don't support tensor ops. - if (cc_major >= 7) { - use_tensor_ops = true; + &cc_major, &cc_minor) && + cc_major >= 5) { + cublasGemmAlgo_t algo = + (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); + bool ok = DoBlasInternalImpl( + AS_LAMBDA(cublasGemmStridedBatchedEx), stream, + true /* = pointer_mode_host */, true /* = err_on_failure */, + CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, + GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb, + stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c, + batch_count, CUDA_R_32F, algo); + if (ok) { + return true; } -#if CUDA_VERSION >= 9010 - if (cc_major >= 5) { - cublasGemmAlgo_t algo = - (use_tensor_ops ? 
CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); - bool ok = DoBlasInternalImpl( - AS_LAMBDA(cublasGemmStridedBatchedEx), stream, - true /* = pointer_mode_host */, true /* = err_on_failure */, - use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), - m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a, - GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c), - CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo); - if (ok) { - return true; - } - LOG(ERROR) << "failed BLAS call, see log for details"; - return false; - } -#endif + LOG(ERROR) << "failed BLAS call, see log for details"; + return false; } #endif // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop. @@ -2445,7 +2439,7 @@ bool CUDABlas::DoBlasGemmStridedBatched( reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c); bool ok = DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), + true /* = err_on_failure= */, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF, lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix, SE_CUDA_DATA_HALF, ldc); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 817bdb72777..556456c83db 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport { template bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - bool use_tensor_op_math, Args... args); + Args... args); // Convenience functions that call DoBlasInternalImpl with different values // for err_on_failure. @@ -91,8 +91,7 @@ class CUDABlas : public blas::BlasSupport { bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host, Args... args) { return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/true, /*use_tensor_ops=*/false, - args...); + /*err_on_failure=*/true, args...); } template bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream, @@ -100,8 +99,7 @@ class CUDABlas : public blas::BlasSupport { // Tensor ops are hard-coded off in this path, but can still be enabled with // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl(). return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/false, - /*use_tensor_ops=*/false, args...); + /*err_on_failure=*/false, args...); } // A helper function to implement DoBlasGemmBatched interfaces for generic From 0c8343f7f3a8066c507b97ff84d8b298655cc5f4 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Mon, 8 Jun 2020 11:21:49 -0500 Subject: [PATCH 018/112] Address review comments --- tensorflow/core/platform/tf32_utils.cc | 10 ++++++---- tensorflow/core/platform/tf32_utils.h | 2 +- tensorflow/python/framework/config.py | 7 +++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc index 715b5996dc3..4456e768c0a 100644 --- a/tensorflow/core/platform/tf32_utils.cc +++ b/tensorflow/core/platform/tf32_utils.cc @@ -14,14 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/platform/tf32_utils.h" +#include namespace tensorflow { -// TODO(nluehr): enable tf32 execution by default after TF32 Ampere testing. 
-static bool tf32_enabled = false; +// Whether TensorFloat-32 should be used where supported. +// TODO(nluehr): Maybe enable by default after TF32 Ampere testing. +static std::atomic tf32_allowed{false}; -void allow_tf32_execution(bool allow) { tf32_enabled = allow; } +void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; } -bool tf32_execution_allowed() { return tf32_enabled; } +bool tf32_execution_allowed() { return tf32_allowed; } } // namespace tensorflow diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h index a0ce58f9bbd..7a158d00ad3 100644 --- a/tensorflow/core/platform/tf32_utils.h +++ b/tensorflow/core/platform/tf32_utils.h @@ -18,7 +18,7 @@ limitations under the License. namespace tensorflow { -void allow_tf32_execution(bool allow); +void allow_tf32_execution(bool allowed); bool tf32_execution_allowed(); diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 042af4d1023..a356e6d9a16 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -23,6 +23,8 @@ from tensorflow.python.eager import context from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export + +# No tf_export until TF is built against CUDA11 which is required for TF32. def tensor_float32_execution_allowed(): """Get if TensorFloat-32 operations are enabled on supported hardware. @@ -31,7 +33,8 @@ def tensor_float32_execution_allowed(): """ return _pywrap_tf32_execution.is_allowed() -def allow_tensor_float_32_execution(allow): +# No tf_export until TF is built against CUDA11 which is required for TF32. +def allow_tensor_float_32_execution(allowed): """Allow use of TensorFloat-32 with float32 ops on supported hardware. TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. @@ -47,7 +50,7 @@ def allow_tensor_float_32_execution(allow): Args: allow: whether to allow TensorFloat-32 execution """ - _pywrap_tf32_execution.allow(allow) + _pywrap_tf32_execution.allow(allowed) @tf_export('config.threading.get_intra_op_parallelism_threads') def get_intra_op_parallelism_threads(): From ed41bb08250cad9f3ddbb6c7fb83e1216ee06031 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 8 Jun 2020 17:57:27 +0100 Subject: [PATCH 019/112] Fix for CI failure. 
Change-Id: I66a5b5ab559207071ea62619e9e612fda9a73202 --- tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt | 4 ++++ .../tools/api/golden/v1/tensorflow.lite.constants.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt index c3199b24d98..9538fe382a0 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt @@ -1,6 +1,10 @@ path: "tensorflow.lite.OpsSet" tf_class { is_instance: "" + member { + name: "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + mtype: "" + } member { name: "SELECT_TF_OPS" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt index 27c227dac64..7f62da6662a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "GRAPHVIZ_DOT" mtype: "" } + member { + name: "INT16" + mtype: "" + } member { name: "INT32" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt index c3199b24d98..9538fe382a0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt @@ -1,6 +1,10 @@ path: "tensorflow.lite.OpsSet" tf_class { is_instance: "" + member { + name: "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + mtype: "" + } member { name: "SELECT_TF_OPS" mtype: "" From dbc7faeecdbdc223c67b03284ba5dc7d25668d3c Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 8 Jun 2020 20:20:22 +0100 Subject: [PATCH 020/112] Addressed reviewer's comment. Change-Id: I5bda332514d8070731b807b750ee7a423d6b4d78 --- tensorflow/lite/python/convert.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index b1095a469f6..52edb700195 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -94,10 +94,15 @@ class OpsSet(enum.Enum): # quantized implementations. TFLITE_BUILTINS_INT8 = "TFLITE_BUILTINS_INT8" - # Convert model using only TensorFlow Lite operations with quantized int8 weights - # and int16 activations. + # Convert model using only TensorFlow Lite operations with quantized int8 weights, + # int16 activations and int64 bias. # Specifying this will throw an error for operations that do not yet have # quantized implementations. + # This quantization mode should be used in models for super-resolution, + # audio signal processing or image de-noising. It improves accuracy + # significantly, but only slightly increases the model size. + # WARNING: These ops are currently experimental and have not yet been finalized. + # They are only compatible with CPU execution, and have not been optimized for production. EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" def __str__(self): From 507c7549317221bcf5b418a66fd0212cd4a7443b Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 17:47:16 +0100 Subject: [PATCH 021/112] Fix for pylint errors. 
Change-Id: Idd96d7a41fd459c86ab0f6fbb63e5d543509145d --- tensorflow/lite/python/convert.py | 3 ++- tensorflow/lite/python/lite.py | 27 ++++++++++--------- tensorflow/lite/python/lite_test.py | 10 ++++--- tensorflow/lite/python/optimize/calibrator.py | 3 ++- .../lite/python/optimize/calibrator_test.py | 3 ++- 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 52edb700195..68e23634b2e 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -103,7 +103,8 @@ class OpsSet(enum.Enum): # significantly, but only slightly increases the model size. # WARNING: These ops are currently experimental and have not yet been finalized. # They are only compatible with CPU execution, and have not been optimized for production. - EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = \ + "EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8" def __str__(self): return self.value diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 26c6f0855af..bed48860b00 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -251,7 +251,8 @@ class QuantizationMode(object): self.post_training_fp16()) def activations_type(self): - return constants.INT16 if self._is_int16x8_target_required() else constants.INT8 + return constants.INT16 if self._is_int16x8_target_required() \ + else constants.INT8 def converter_flags(self, inference_ty=None, inference_input_ty=None): """Flags to the converter.""" @@ -262,7 +263,8 @@ class QuantizationMode(object): if self.training_time_int8_allow_float(): return { - "inference_type": inference_ty if inference_ty else self.activations_type(), + "inference_type": inference_ty if inference_ty else \ + self.activations_type(), "inference_input_type": inference_input_ty if inference_input_ty else constants.FLOAT, "post_training_quantize": False, # disable dynamic range quantization @@ -359,15 +361,15 @@ class QuantizationMode(object): def _is_int16x8_target_required(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) def _is_allow_float(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.TFLITE_BUILTINS - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) def _any_optimization_enabled(self): return bool( @@ -441,7 +443,8 @@ class TFLiteConverterBase(object): return _get_grappler_config(optimizers) def _calibrate_quantize_model(self, result, inference_input_type, - inference_output_type, activations_type, allow_float): + inference_output_type, activations_type, + allow_float): """Calibrate and quantize the model.""" if not isinstance(self.representative_dataset, RepresentativeDataset): self.representative_dataset = RepresentativeDataset( @@ -458,8 +461,8 @@ class TFLiteConverterBase(object): return _mlir_quantize(calibrated) else: return calibrate_quantize.calibrate_and_quantize( - self.representative_dataset.input_gen, inference_input_type, - inference_output_type, allow_float, activations_type) + self.representative_dataset.input_gen, inference_input_type, + inference_output_type, allow_float, 
activations_type) def _is_unknown_shapes_allowed(self): # Unknown dimensions are only allowed with the new converter. @@ -1992,7 +1995,7 @@ class TocoConverter(object): @classmethod @_deprecation.deprecated( - None, "Use `lite.TFLiteConverter.from_keras_model_file` instead.") + None, "Use `lite.TFLiteConverter.from_keras_model_file` instead.") def from_keras_model_file(cls, model_file, input_arrays=None, diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 044b1211e17..cae49cb147f 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -882,11 +882,15 @@ class FromSessionTest(TestModels, parameterized.TestCase): @parameterized.named_parameters( # Quantize model to Int8: with enable mlir - ('UseTfliteBuiltinsIntEnableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], True), + ('UseTfliteBuiltinsIntEnableMLIR', + [lite.OpsSet.TFLITE_BUILTINS_INT8], True), # Quantize model to Int8: with disable mlir - ('UseTfliteBuiltinsIntDisableMLIR', [lite.OpsSet.TFLITE_BUILTINS_INT8], False), + ('UseTfliteBuiltinsIntDisableMLIR', + [lite.OpsSet.TFLITE_BUILTINS_INT8], False), # Quantize model to Int16: with disable mlir - ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], False)) + ('UseTfliteBuiltinsInt16DisableMLIR', + [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], + False)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py index 90c43fcddfa..2b08ec690ff 100644 --- a/tensorflow/lite/python/optimize/calibrator.py +++ b/tensorflow/lite/python/optimize/calibrator.py @@ -78,7 +78,8 @@ class Calibrator(object): computation, useful when targeting an integer-only backend. If False, an error will be thrown if an operation cannot be quantized, otherwise the model will fallback to float ops. - activations_type: A tf.dtype representing the desired type for activations. + activations_type: A tf.dtype representing the desired type for + activations. resize_input: A boolean. True if the shape of the sample data is different from the input. """ diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py index f778c8a555d..d79d76b09ed 100644 --- a/tensorflow/lite/python/optimize/calibrator_test.py +++ b/tensorflow/lite/python/optimize/calibrator_test.py @@ -96,7 +96,8 @@ class CalibratorTest(test_util.TensorFlowTestCase, parameterized.TestCase): ('UseActivationTypeInt8 - EnableMlirQuantizer', constants.INT8), # Activation type Int16 ('UseActivationTypeInt16 - DisableEnableMlirQuantizer', constants.INT16)) - def test_calibration_with_quantization_multiple_inputs(self, activations_type): + def test_calibration_with_quantization_multiple_inputs(self, + activations_type): # Load multi add model from test data. # This model has 4 inputs of size (1, 8, 8, 3). model_path = resource_loader.get_path_to_datafile( From 3fc256a0dc31eae6711d2f7680493925f0fa4091 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 17:51:37 +0100 Subject: [PATCH 022/112] Fix for pylint. 
Change-Id: If2674380c25eb8973e73a407b75660088098e6da --- tensorflow/lite/python/lite_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index cae49cb147f..e6661c82894 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -883,14 +883,15 @@ class FromSessionTest(TestModels, parameterized.TestCase): @parameterized.named_parameters( # Quantize model to Int8: with enable mlir ('UseTfliteBuiltinsIntEnableMLIR', - [lite.OpsSet.TFLITE_BUILTINS_INT8], True), + [lite.OpsSet.TFLITE_BUILTINS_INT8], True), # Quantize model to Int8: with disable mlir ('UseTfliteBuiltinsIntDisableMLIR', - [lite.OpsSet.TFLITE_BUILTINS_INT8], False), + [lite.OpsSet.TFLITE_BUILTINS_INT8], False), # Quantize model to Int16: with disable mlir ('UseTfliteBuiltinsInt16DisableMLIR', - [lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], - False)) + [lite.OpsSet.\ + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], + False)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() From 47674cac85b61f7a438c6970a6a7ca49946a2622 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 22:27:57 +0100 Subject: [PATCH 023/112] Fix for pylint Change-Id: If03f60a3eebc7aed61c10870c545fe6035bcb2a3 --- tensorflow/lite/python/lite.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index bed48860b00..06796ba820b 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -361,15 +361,15 @@ class QuantizationMode(object): def _is_int16x8_target_required(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 + ])) def _is_allow_float(self): return bool( - set(self._target_spec.supported_ops).intersection([ - OpsSet.TFLITE_BUILTINS - ])) + set(self._target_spec.supported_ops).intersection([ + OpsSet.TFLITE_BUILTINS + ])) def _any_optimization_enabled(self): return bool( From 84afc268a77f543fe64ecb45832701278a9eb129 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Fri, 5 Jun 2020 11:47:09 +0100 Subject: [PATCH 024/112] Documentation on the new experimental option for 16x8. --- .../performance/post_training_quantization.md | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index af7d9dbf02d..c48a2820d2f 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -151,6 +151,49 @@ The disadvantages of float16 quantization are as follows: to float32 when run on the CPU. (Note that the GPU delegate will not perform this dequantization, since it can operate on float16 data.) +### Integer only: 16-bit activations with 8-bit weights (experimental) + +This is an experimental quantization scheme. It is similar to the "integer only" +scheme, but activations are quantized based on their range to 16-bits, weights are +quantized in 8-bit integer and bias is quantized into 64-bit integer. 
+This scheme is referred to below as 16x8 quantization.
+
+The main advantage of this quantization is that it can improve accuracy
+significantly, while only slightly increasing the model size.
+
+<pre>
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8]
+tflite_quant_model = converter.convert()
+
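+
+As with the other full-integer modes, the activation ranges are calibrated from
+sample data, so the converter also needs a representative dataset. The snippet
+below is only an illustrative sketch: the input shape (1, 224, 224, 3) and the
+random data are placeholders, and a real model should instead be calibrated with
+a few hundred representative input samples.
+
+<pre>
+import numpy as np
+
+def representative_dataset():
+  # Yield a small number of calibration samples. The shape is a placeholder
+  # and must match the model's input shape.
+  for _ in range(100):
+    yield [np.random.rand(1, 224, 224, 3).astype(np.float32)]
+
+converter.representative_dataset = representative_dataset
+</pre>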
+
+If 16x8 quantization is not supported for some operators in the model,
+then the model can still be quantized, but unsupported operators are kept in float.
+The following option should be added to the target_spec to allow this.
+<pre>
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_ops = [tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8,
+tf.lite.OpsSet.TFLITE_BUILTINS]
+tflite_quant_model = converter.convert()
+
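+
+To check which types were actually assigned, the resulting model can be loaded
+into the TensorFlow Lite interpreter and its tensor types listed. The snippet
+below is only an illustrative sketch: with 16x8 quantization the internal
+activation tensors are expected to be int16 and the weights int8, while the
+model inputs and outputs stay in float by default.
+
+<pre>
+import tensorflow as tf
+
+interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
+interpreter.allocate_tensors()
+for tensor in interpreter.get_tensor_details():
+  print(tensor['name'], tensor['dtype'])
+</pre>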
+ +Examples of the use cases where accuracy improvements provided by this quantization scheme include: +* super-resolution, +* audio signal processing such as noise cancelling and beamforming, +* image de-noising, +* HDR reconstruction from a single image. + +The disadvantage of this quantization is: + +* Currently inference is noticeably slower than 8-bit full integer due to the lack of optimized kernel implementation. +* Currently it is incompatible with the existing hardware accelerated TFLite delegates. + +Note: This is an experimental feature. + ### Model accuracy Since weights are quantized post training, there could be an accuracy loss, From dcfc2175c79ee6c610770b597c8d637daa1649bc Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Tue, 9 Jun 2020 22:35:57 +0100 Subject: [PATCH 025/112] Small change of comment per reviewer's note. Change-Id: I1233b95282befebfa0e6c06173f5e928aef60b22 --- tensorflow/lite/python/convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 68e23634b2e..ec70f793f21 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -98,9 +98,9 @@ class OpsSet(enum.Enum): # int16 activations and int64 bias. # Specifying this will throw an error for operations that do not yet have # quantized implementations. - # This quantization mode should be used in models for super-resolution, + # This quantization mode may be used in models for super-resolution, # audio signal processing or image de-noising. It improves accuracy - # significantly, but only slightly increases the model size. + # significantly, but only slightly increases the model size. # WARNING: These ops are currently experimental and have not yet been finalized. # They are only compatible with CPU execution, and have not been optimized for production. 
EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 = \ From aaf693d29bb5a52d6dfd8c106e45b2ff513e6d84 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 2 Jun 2020 08:07:53 -0700 Subject: [PATCH 026/112] Added code to build multiinstance/multinode container --- .../ci_build/linux/mkl/Dockerfile.devel-mkl | 16 ++++-- .../ci_build/linux/mkl/build-dev-container.sh | 26 ++++++++++ .../linux/mkl/install_openmpi_horovod.sh | 49 +++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100755 tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 45ccf67d707..3893f61d940 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -15,6 +15,11 @@ ARG CONFIG_BFLOAT16_BUILD="" ARG ENABLE_SECURE_BUILD ARG BAZEL_VERSION="" ARG ENABLE_DNNL1="" +ARG ENABLE_HOROVOD="" +ARG OPENMPI_VERSION="" +ARG OPENMPI_DOWNLOAD_URL="" + +ENV DEBIAN_FRONTEND=noninteractive # Upgrade Bazel version if argument is passed RUN if [ "${BAZEL_VERSION}" != "" ]; then \ @@ -45,9 +50,6 @@ RUN ${PYTHON} set-build-env.py -p ${TARGET_PLATFORM} -f /root/.mkl.bazelrc \ # Pull the compiler flags we just wrote into root user's .bazelrc file RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc -# Install futures>=0.17.1 for Python2.7 compatibility mode -RUN ${PIP} install future>=0.17.1 - RUN bazel --bazelrc=/root/.bazelrc build -c opt \ tensorflow/tools/pip_package:build_pip_package && \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \ @@ -55,6 +57,14 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ rm -rf /root/.cache # Clean up Bazel cache when done. +#Install OpenMPI/Horovod +COPY install_openmpi_horovod.sh . 
+RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ + chmod +x install_openmpi_horovod.sh && \ + ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} install_openmpi_horovod.sh && \ + rm -rf install_openmpi_horovod.sh; \ + fi + # TensorBoard EXPOSE 6006 # IPython diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index eceef65aa38..da647153cdb 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -64,6 +64,9 @@ ENABLE_SECURE_BUILD=${ENABLE_SECURE_BUILD:-no} BAZEL_VERSION=${BAZEL_VERSION} BUILD_PY2_CONTAINERS=${BUILD_PY2_CONTAINERS:-no} ENABLE_DNNL1=${ENABLE_DNNL1:-no} +ENABLE_HOROVOD=${ENABLE_HOROVOD:-no} +OPENMPI_VERSION=${OPENMPI_VERSION} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} debug "ROOT_CONTAINER=${ROOT_CONTAINER}" debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}" @@ -82,6 +85,9 @@ debug "TMP_DIR=${TMP_DIR}" debug "BAZEL_VERSION=${BAZEL_VERSION}" debug "BUILD_PY2_CONTAINERS=${BUILD_PY2_CONTAINERS}" debug "ENABLE_DNNL1=${ENABLE_DNNL1}" +debug "ENABLE_HOROVOD=${ENABLE_HOROVOD}" +debug "OPENMPI_VERSION=${OPENMPI_VERSION}" +debug "OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}" function build_container() { @@ -131,6 +137,13 @@ function build_container() TF_DOCKER_BUILD_ARGS+=("--build-arg BAZEL_VERSION=${BAZEL_VERSION}") fi + # Add build arg for installing OpenMPI/Horovod + if [[ ${ENABLE_HOROVOD} == "yes" ]]; then + TF_DOCKER_BUILD_ARGS+=("--build-arg ENABLE_HOROVOD=${ENABLE_HOROVOD}") + TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_VERSION=${OPENMPI_VERSION}") + TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}") + fi + # Perform docker build debug "Building docker image with image name and tag: ${TEMP_IMAGE_NAME}" CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${TEMP_IMAGE_NAME} -f Dockerfile.devel-mkl ." @@ -188,6 +201,19 @@ function test_container() die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}" fi + # Test to check if horovod is installed successfully + debug "Test horovod in the container..." + if [[ ${ENABLE_HOROVOD} == "yes" ]]; then + HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") + ${HOROVOD_TEST_CMD} + if [[ $? == "0" ]]; then + echo "PASS: HOROVOD installation test in ${TEMP_IMAGE_NAME}" + else + die "FAIL: HOROVOD installation test in ${TEMP_IMAGE_NAME}" + fi + fi + + # Stop the running docker container sleep 1 "${DOCKER_BINARY}" stop --time=0 ${CONTAINER_ID} diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh new file mode 100755 index 00000000000..d1b297726ed --- /dev/null +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# install OpenMPI, OpenSSH and Horovod + +set -e + +apt-get clean && apt-get update -y + +# Install Open MPI +OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} +echo "Installing OpenMPI version ${OPENMPI_VERSION}..." +echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL}..." 
+ +mkdir /tmp/openmpi +cd /tmp/openmpi +curl -fSsL -O ${OPENMPI_DOWNLOAD_URL} +tar zxf ${OPENMPI_VERSION}.tar.gz +cd ${OPENMPI_VERSION} +./configure --enable-mpirun-prefix-by-default +make -j $(nproc) all +make install +ldconfig +cd / +rm -rf /tmp/openmpi + +# Create a wrapper for OpenMPI to allow running as root by default +mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real +echo '#!/bin/bash' > /usr/local/bin/mpirun +echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun +chmod a+x /usr/local/bin/mpirun + +# Configure OpenMPI to run good defaults: +echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf + +#Check mpi version +echo 'OpenMPI version:' +mpirun --version + +# Install OpenSSH for MPI to communicate between containers +apt-get install -y --no-install-recommends --fix-missing openssh-client openssh-server libnuma-dev +mkdir -p /var/run/sshd +# Allow OpenSSH to talk to containers without asking for confirmation +cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new +echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new +mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +#Install Horovod +HOROVOD_WITH_TENSORFLOW=1 +python3 -m pip install --no-cache-dir horovod==0.19.1 From ff359d4a48aeb1905f767d32e7da1a2d01d4ce6a Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 2 Jun 2020 13:46:16 -0700 Subject: [PATCH 027/112] Setting default values for OpenMPI versions --- .../tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 2 +- .../ci_build/linux/mkl/install_openmpi_horovod.sh | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 3893f61d940..1fd54ff703f 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -61,7 +61,7 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ COPY install_openmpi_horovod.sh . RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ - ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} install_openmpi_horovod.sh && \ + ./install_openmpi_horovod.sh ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} && \ rm -rf install_openmpi_horovod.sh; \ fi diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index d1b297726ed..4c8b04f6024 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -5,9 +5,16 @@ set -e apt-get clean && apt-get update -y +# Set default +if [[ $# -gt 1 ]]; then + OPENMPI_VERSION="${1}" + OPENMPI_DOWNLOAD_URL="${2}" +else + OPENMPI_VERSION=openmpi-2.1.1 + OPENMPI_DOWNLOAD_URL=https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz +fi + # Install Open MPI -OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} -OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} echo "Installing OpenMPI version ${OPENMPI_VERSION}..." echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL}..." 
From aa5bfd35fa5292d820493483da8540f8a6386c5f Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 12:15:24 -0700 Subject: [PATCH 028/112] Added license to the shell script --- .../linux/mkl/install_openmpi_horovod.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 4c8b04f6024..0f5a670f0f2 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -1,5 +1,20 @@ #!/usr/bin/env bash -# install OpenMPI, OpenSSH and Horovod +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Install OpenMPI, OpenSSH and Horovod in Intel(R) MKL support +# Usage: install_openmpi_horovod.sh [openmpi version] [openmpi download url] set -e From 615d3ce1af92614a3285807caaf42f50acd66fae Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 13:14:39 -0700 Subject: [PATCH 029/112] Added install futures for backward compatibility --- tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 1fd54ff703f..f4ab7ba21c4 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -50,6 +50,9 @@ RUN ${PYTHON} set-build-env.py -p ${TARGET_PLATFORM} -f /root/.mkl.bazelrc \ # Pull the compiler flags we just wrote into root user's .bazelrc file RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc +# Install futures>=0.17.1 for Python2.7 compatibility mode +RUN ${PIP} install future>=0.17.1 + RUN bazel --bazelrc=/root/.bazelrc build -c opt \ tensorflow/tools/pip_package:build_pip_package && \ bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \ From 7a048082c1c7aa9057c8448b77ee4cde069ec3a7 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 13:19:53 -0700 Subject: [PATCH 030/112] Removed extra line --- tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index da647153cdb..7278724ff64 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -202,8 +202,8 @@ function test_container() fi # Test to check if horovod is installed successfully - debug "Test horovod in the container..." if [[ ${ENABLE_HOROVOD} == "yes" ]]; then + debug "Test horovod in the container..." 
HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") ${HOROVOD_TEST_CMD} if [[ $? == "0" ]]; then @@ -213,7 +213,6 @@ function test_container() fi fi - # Stop the running docker container sleep 1 "${DOCKER_BINARY}" stop --time=0 ${CONTAINER_ID} From 50f0ba885bc112b24b437c8a974c6a8deaace96b Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 3 Jun 2020 20:05:47 -0700 Subject: [PATCH 031/112] Added parameter to pass horovod version and fixed comments --- .../ci_build/linux/mkl/Dockerfile.devel-mkl | 5 ++-- .../ci_build/linux/mkl/build-dev-container.sh | 3 +++ .../linux/mkl/install_openmpi_horovod.sh | 26 ++++++++----------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index f4ab7ba21c4..8a5a0a42050 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -18,6 +18,7 @@ ARG ENABLE_DNNL1="" ARG ENABLE_HOROVOD="" ARG OPENMPI_VERSION="" ARG OPENMPI_DOWNLOAD_URL="" +ARG HOROVOD_VERSION="" ENV DEBIAN_FRONTEND=noninteractive @@ -60,11 +61,11 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ rm -rf /root/.cache # Clean up Bazel cache when done. -#Install OpenMPI/Horovod +# Install OpenMPI/Horovod COPY install_openmpi_horovod.sh . RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ - ./install_openmpi_horovod.sh ${OPENMPI_VERSION} ${OPENMPI_DOWNLOAD_URL} && \ + ./install_openmpi_horovod.sh OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} && \ rm -rf install_openmpi_horovod.sh; \ fi diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index 7278724ff64..e9d7f1ff388 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -67,6 +67,7 @@ ENABLE_DNNL1=${ENABLE_DNNL1:-no} ENABLE_HOROVOD=${ENABLE_HOROVOD:-no} OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} +HOROVOD_VERSION=${HOROVOD_VERSION} debug "ROOT_CONTAINER=${ROOT_CONTAINER}" debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}" @@ -88,6 +89,7 @@ debug "ENABLE_DNNL1=${ENABLE_DNNL1}" debug "ENABLE_HOROVOD=${ENABLE_HOROVOD}" debug "OPENMPI_VERSION=${OPENMPI_VERSION}" debug "OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}" +debug "HOROVOD_VERSION=${HOROVOD_VERSION}" function build_container() { @@ -142,6 +144,7 @@ function build_container() TF_DOCKER_BUILD_ARGS+=("--build-arg ENABLE_HOROVOD=${ENABLE_HOROVOD}") TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_VERSION=${OPENMPI_VERSION}") TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}") + TF_DOCKER_BUILD_ARGS+=("--build-arg HOROVOD_VERSION=${HOROVOD_VERSION}") fi # Perform docker build diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 0f5a670f0f2..b8d9739ceb6 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,25 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# Install OpenMPI, OpenSSH and Horovod in Intel(R) MKL support -# Usage: install_openmpi_horovod.sh [openmpi version] [openmpi download url] +# Install OpenMPI, OpenSSH and Horovod during Intel(R) MKL container build +# Usage: install_openmpi_horovod.sh [OPENMPI_VERSION=] [OPENMPI_DOWNLOAD_URL=] [HOROVOD_VERSION=] set -e apt-get clean && apt-get update -y # Set default -if [[ $# -gt 1 ]]; then - OPENMPI_VERSION="${1}" - OPENMPI_DOWNLOAD_URL="${2}" -else - OPENMPI_VERSION=openmpi-2.1.1 - OPENMPI_DOWNLOAD_URL=https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz -fi +OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} +HOROVOD_VERSION=${HOROVOD_VERSION:-0.19.1} # Install Open MPI -echo "Installing OpenMPI version ${OPENMPI_VERSION}..." -echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL}..." +echo "Installing OpenMPI version ${OPENMPI_VERSION} ..." +echo "OpenMPI Download url ${OPENMPI_DOWNLOAD_URL} ..." mkdir /tmp/openmpi cd /tmp/openmpi @@ -54,7 +50,7 @@ chmod a+x /usr/local/bin/mpirun # Configure OpenMPI to run good defaults: echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf -#Check mpi version +# Check mpi version echo 'OpenMPI version:' mpirun --version @@ -66,6 +62,6 @@ cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.ne echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config -#Install Horovod +# Install Horovod HOROVOD_WITH_TENSORFLOW=1 -python3 -m pip install --no-cache-dir horovod==0.19.1 +python3 -m pip install --no-cache-dir horovod==${HOROVOD_VERSION} From 1ed2ab4638c56b3cce6d0f85efeaea3600b75214 Mon Sep 17 00:00:00 2001 From: justkw Date: Thu, 4 Jun 2020 09:22:07 -0700 Subject: [PATCH 032/112] Adding parameter to use --nightly_flag to install specific packages if building the nightly build --- tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 5 +++-- tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh | 7 +++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 8a5a0a42050..a78d13c7755 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -19,6 +19,7 @@ ARG ENABLE_HOROVOD="" ARG OPENMPI_VERSION="" ARG OPENMPI_DOWNLOAD_URL="" ARG HOROVOD_VERSION="" +ARG TF_NIGHTLY_FLAG="" ENV DEBIAN_FRONTEND=noninteractive @@ -56,8 +57,8 @@ RUN ${PIP} install future>=0.17.1 RUN bazel --bazelrc=/root/.bazelrc build -c opt \ tensorflow/tools/pip_package:build_pip_package && \ - bazel-bin/tensorflow/tools/pip_package/build_pip_package "${WHL_DIR}" && \ - ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/tensorflow-*.whl && \ + bazel-bin/tensorflow/tools/pip_package/build_pip_package "${TF_NIGHTLY_FLAG}" "${WHL_DIR}" && \ + ${PIP} --no-cache-dir install --upgrade "${WHL_DIR}"/*.whl && \ rm -rf /root/.cache # Clean up Bazel cache when done. 
diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index e9d7f1ff388..83b3ebaf9c9 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -68,6 +68,7 @@ ENABLE_HOROVOD=${ENABLE_HOROVOD:-no} OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} +IS_NIGHTLY=${IS_NIGHTLY:-no} debug "ROOT_CONTAINER=${ROOT_CONTAINER}" debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}" @@ -90,6 +91,7 @@ debug "ENABLE_HOROVOD=${ENABLE_HOROVOD}" debug "OPENMPI_VERSION=${OPENMPI_VERSION}" debug "OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}" debug "HOROVOD_VERSION=${HOROVOD_VERSION}" +debug "IS_NIGHTLY=${IS_NIGHTLY}" function build_container() { @@ -147,6 +149,11 @@ function build_container() TF_DOCKER_BUILD_ARGS+=("--build-arg HOROVOD_VERSION=${HOROVOD_VERSION}") fi + # Add build arg --nightly_flag for the nightly build + if [[ ${IS_NIGHTLY} == "yes" ]]; then + TF_DOCKER_BUILD_ARGS+=("--build-arg TF_NIGHTLY_FLAG=--nightly_flag") + fi + # Perform docker build debug "Building docker image with image name and tag: ${TEMP_IMAGE_NAME}" CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${TEMP_IMAGE_NAME} -f Dockerfile.devel-mkl ." From 83decf0d382b76771e2b1ad4fa43d208d5e40eb0 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Mon, 8 Jun 2020 07:50:14 -0700 Subject: [PATCH 033/112] Support multiple OS --- .../ci_build/linux/mkl/install_openmpi_horovod.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index b8d9739ceb6..6044927d2ce 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -55,7 +55,18 @@ echo 'OpenMPI version:' mpirun --version # Install OpenSSH for MPI to communicate between containers -apt-get install -y --no-install-recommends --fix-missing openssh-client openssh-server libnuma-dev +( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + libnuma-dev \ + openssh-server \ + openssh-client \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* ) || \ + ( yum -y update && yum -y install \ + numactl-devel \ + openssh-server \ + openssh-clients \ + yum clean all ) || \ + ( echo "Unsupported Linux distribution. Aborting!" 
&& exit 1 ) mkdir -p /var/run/sshd # Allow OpenSSH to talk to containers without asking for confirmation cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new From 371c2e6f4f3f233041eda2d292a13824d98d769f Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Mon, 8 Jun 2020 08:52:49 -0700 Subject: [PATCH 034/112] Bug fix --- .../tools/ci_build/linux/mkl/install_openmpi_horovod.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 6044927d2ce..276d9945ab6 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -58,13 +58,13 @@ mpirun --version ( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ libnuma-dev \ openssh-server \ - openssh-client \ + openssh-client && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ) || \ ( yum -y update && yum -y install \ numactl-devel \ openssh-server \ - openssh-clients \ + openssh-clients && \ yum clean all ) || \ ( echo "Unsupported Linux distribution. Aborting!" && exit 1 ) mkdir -p /var/run/sshd From da18384ad585b2d88a08119268c9a7134ee36bf5 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Mon, 8 Jun 2020 09:51:25 -0700 Subject: [PATCH 035/112] Bug fix --- tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index 276d9945ab6..b765dbd70a6 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -58,7 +58,7 @@ mpirun --version ( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ libnuma-dev \ openssh-server \ - openssh-client && \ + openssh-clients && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ) || \ ( yum -y update && yum -y install \ From ab86bb82faabec7b1d29c61df1cae0b45d0b0e8e Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 9 Jun 2020 07:53:05 -0700 Subject: [PATCH 036/112] Bug fix --- .../linux/mkl/install_openmpi_horovod.sh | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index b765dbd70a6..aec40543a17 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -14,15 +14,14 @@ # limitations under the License. 
# ============================================================================== # Install OpenMPI, OpenSSH and Horovod during Intel(R) MKL container build -# Usage: install_openmpi_horovod.sh [OPENMPI_VERSION=] [OPENMPI_DOWNLOAD_URL=] [HOROVOD_VERSION=] +# Usage: install_openmpi_horovod.sh [OPENMPI_VERSION=] [OPENMPI_DOWNLOAD_URL=] +# [HOROVOD_VERSION=] set -e -apt-get clean && apt-get update -y - # Set default OPENMPI_VERSION=${OPENMPI_VERSION:-openmpi-2.1.1} -OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/${OPENMPI_VERSION}.tar.gz} +OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL:-https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.1.tar.gz} HOROVOD_VERSION=${HOROVOD_VERSION:-0.19.1} # Install Open MPI @@ -55,18 +54,20 @@ echo 'OpenMPI version:' mpirun --version # Install OpenSSH for MPI to communicate between containers -( apt-get update && apt-get install -y --no-install-recommends --fix-missing \ - libnuma-dev \ - openssh-server \ - openssh-clients && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* ) || \ - ( yum -y update && yum -y install \ - numactl-devel \ - openssh-server \ - openssh-clients && \ - yum clean all ) || \ - ( echo "Unsupported Linux distribution. Aborting!" && exit 1 ) +apt-get clean && apt-get update && apt-get install -y --no-install-recommends --fix-missing \ + openssh-client openssh-server libnuma-dev && \ + rm -rf /var/lib/apt/lists/* +if [[ $? == "0" ]]; then + echo "PASS: OpenSSH installation" +else + yum -y update && yum -y install numactl-devel openssh-server openssh-clients && \ + yum clean all + if [[ $? == "0" ]]; then + echo "PASS: OpenSSH installation" + else + echo "Unsupported Linux distribution. Aborting!" && exit 1 + fi +fi mkdir -p /var/run/sshd # Allow OpenSSH to talk to containers without asking for confirmation cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new From ae408bb512e614469e24ccf0db6c031f6aeac030 Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Tue, 9 Jun 2020 18:00:47 -0700 Subject: [PATCH 037/112] remvoe trailing white space --- tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index 83b3ebaf9c9..6e789a54e87 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -214,7 +214,7 @@ function test_container() # Test to check if horovod is installed successfully if [[ ${ENABLE_HOROVOD} == "yes" ]]; then debug "Test horovod in the container..." - HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") + HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") ${HOROVOD_TEST_CMD} if [[ $? == "0" ]]; then echo "PASS: HOROVOD installation test in ${TEMP_IMAGE_NAME}" From 013ddd96d0ce111ca5ec1422b2899b66ec41a036 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 9 Jun 2020 15:35:47 -0500 Subject: [PATCH 038/112] Use CUDNN_TENSOR_OP_MATH to enable tensor cores. 
--- tensorflow/stream_executor/cuda/cuda_dnn.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index fa06d410323..28ec6a842bb 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -723,7 +723,7 @@ class CudnnConvolutionDescriptor { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = #if CUDNN_VERSION >= 8000 - (use_tensor_op_math ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH); + (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH); #else (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); #endif @@ -1179,8 +1179,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { cudnnMathType_t math_type; if (use_tensor_ops) { - math_type = - CUDNN_VERSION >= 8000 ? CUDNN_DEFAULT_MATH : CUDNN_TENSOR_OP_MATH; + math_type = CUDNN_TENSOR_OP_MATH; } else { math_type = CUDNN_VERSION >= 8000 ? CUDNN_FMA_MATH : CUDNN_DEFAULT_MATH; } From da0d85808e77aa62287ba22e822a9e83866a43a4 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Thu, 11 Jun 2020 15:35:04 -0500 Subject: [PATCH 039/112] Make python names consistent --- tensorflow/python/framework/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index a356e6d9a16..1ff2fa613da 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -34,7 +34,7 @@ def tensor_float32_execution_allowed(): return _pywrap_tf32_execution.is_allowed() # No tf_export until TF is built against CUDA11 which is required for TF32. -def allow_tensor_float_32_execution(allowed): +def allow_tensor_float32_execution(allowed): """Allow use of TensorFloat-32 with float32 ops on supported hardware. TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. From d03b86fe4462ebad1f73d460c2aceab47372b239 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Thu, 11 Jun 2020 23:05:40 +0100 Subject: [PATCH 040/112] Extended test to the case when new converter is enabled. 
Change-Id: I83f20d025027ad1266f99f9d79932cab4f1a9ed5 --- tensorflow/lite/python/lite_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index e6661c82894..478840c5549 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -891,7 +891,11 @@ class FromSessionTest(TestModels, parameterized.TestCase): ('UseTfliteBuiltinsInt16DisableMLIR', [lite.OpsSet.\ EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], - False)) + False), + ('UseTfliteBuiltinsInt16EnableMLIR', + [lite.OpsSet.\ + EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8], + True)) def testCalibrateAndQuantizeBuiltinInt(self, supported_ops, enable_mlir): with ops.Graph().as_default(): inp, output, calibration_gen = self._getCalibrationQuantizeModel() From 4bd42cdd0e68cebe8b280b323cde4d01a9d2bf3a Mon Sep 17 00:00:00 2001 From: Reed Date: Thu, 11 Jun 2020 17:16:28 -0700 Subject: [PATCH 041/112] Use float_32 instead of float32 in function names --- tensorflow/python/framework/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 1ff2fa613da..bbaa2ca8248 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -25,7 +25,7 @@ from tensorflow.python.util.tf_export import tf_export # No tf_export until TF is built against CUDA11 which is required for TF32. -def tensor_float32_execution_allowed(): +def tensor_float_32_execution_allowed(): """Get if TensorFloat-32 operations are enabled on supported hardware. Returns: @@ -34,7 +34,7 @@ def tensor_float32_execution_allowed(): return _pywrap_tf32_execution.is_allowed() # No tf_export until TF is built against CUDA11 which is required for TF32. -def allow_tensor_float32_execution(allowed): +def allow_tensor_float_32_execution(allowed): """Allow use of TensorFloat-32 with float32 ops on supported hardware. TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. From 01b30fa03f636396f977a76628123199b772463a Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 15 Jun 2020 17:30:29 +0100 Subject: [PATCH 042/112] Fix for the linter. 
Change-Id: Ie1185cc2cca9157655b22e1d3bb49ddc017a8f0e --- tensorflow/lite/tools/optimize/quantization_utils.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc index cdf2743585e..cdc794c20c4 100644 --- a/tensorflow/lite/tools/optimize/quantization_utils.cc +++ b/tensorflow/lite/tools/optimize/quantization_utils.cc @@ -92,7 +92,6 @@ void GetSymmetricQuantizationParams( min = std::min(min, 0.0f); max = std::max(max, 0.0f); const float scale = std::max(std::abs(max), std::abs(min)) / half_quant_range; - int64_t zero_point = 0; quantization_params->min = std::vector(1, min); quantization_params->max = std::vector(1, max); quantization_params->scale = std::vector(1, scale); From 5016da312802f8372672ccbfa7a4207b8683a8e5 Mon Sep 17 00:00:00 2001 From: sshiddib Date: Mon, 15 Jun 2020 13:21:21 -0700 Subject: [PATCH 043/112] [Intel MKL] Adding DNNL ops (part 2) supporting threadpool work --- tensorflow/core/kernels/mkl_aggregate_ops.cc | 7 +++++-- tensorflow/core/kernels/mkl_concat_op.cc | 17 ++++++++++++++--- tensorflow/core/kernels/mkl_dequantize_op.cc | 7 +++++-- tensorflow/core/kernels/mkl_lrn_op.cc | 6 ++++-- tensorflow/core/kernels/mkl_transpose_op.cc | 5 +++-- tensorflow/core/util/mkl_util.h | 14 ++++++++++++-- 6 files changed, 43 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index ec5f80cb3fa..90e0ea9aa95 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -178,6 +178,9 @@ class MklAddNOp : public OpKernel { dnn_fmt = MklTensorFormatToMklDnnDataFormat(mkl_data_format); } + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(ctx, cpu_engine)); + // Create memory descriptor for MKL-DNN. // If all input in Tensorflow format, create block memory descriptor, // else convert TF format to MKL memory descriptor @@ -215,6 +218,7 @@ class MklAddNOp : public OpKernel { srcs_pd.push_back(memory::primitive_desc(md, cpu_engine)); #endif src.SetUsrMem(md, &src_tensor); + src.SetUsrMemDataHandle(&src_tensor, fwd_cpu_stream); inputs.push_back(src.GetOpMem()); } @@ -240,11 +244,10 @@ class MklAddNOp : public OpKernel { } AllocateOutputSetMklShape(ctx, kOutputIdx, &dst_tensor, output_tf_shape, output_mkl_shape); - dst.SetUsrMemDataHandle(dst_tensor); + dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream); // Create Sum op, and submit net for execution. 
std::vector net; - stream* fwd_cpu_stream = CreateStream(ctx, cpu_engine); #ifdef ENABLE_MKLDNN_V1 mkldnn::sum sum_op(sum_pd); std::unordered_map net_args = { diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index 976f778424e..4a5cb0a0d4f 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -281,11 +281,19 @@ class MklConcatFwdPrimitive : public MklPrimitive { std::shared_ptr fwd_stream) { DCHECK_EQ(in_data.size(), context_.data_mem.size()); for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) { +#ifdef ENABLE_MKLDNN_THREADPOOL + context_.data_mem_shdptr[i]->set_data_handle( + static_cast(in_data[i].get_data_handle()), *fwd_stream); + } + context_.dst_mem->set_data_handle( + static_cast(dst_data.get_data_handle()), *fwd_stream); +#else context_.data_mem_shdptr[i]->set_data_handle( static_cast(in_data[i].get_data_handle())); } context_.dst_mem->set_data_handle( static_cast(dst_data.get_data_handle())); +#endif // ENABLE_MKLDNN_THREADPOOL for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) { context_.data_mem[i] = *context_.data_mem_shdptr[i]; @@ -788,11 +796,13 @@ class MklConcatOp : public OpKernel { dnn_shape_dst); DCHECK(dst_tensor != nullptr) << "Output tensor pointer is NULL"; + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, cpu_engine)); + if (dnn_shape_dst.IsMklTensor()) dst_md = dnn_shape_dst.GetMklLayout(); dst.SetUsrMem(dst_md, dst_tensor); - std::shared_ptr fwd_cpu_stream; - fwd_cpu_stream.reset(CreateStream(context, cpu_engine)); + dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream); #ifdef ENABLE_MKLDNN_V1 auto concat_op = concat(concat_pd); std::unordered_map net_args = { @@ -830,9 +840,10 @@ class MklConcatOp : public OpKernel { dst_md = dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md; - dst.SetUsrMem(dst_md, dst_tensor); std::shared_ptr fwd_cpu_stream; fwd_cpu_stream.reset(CreateStream(context, concat_fwd->GetEngine())); + dst.SetUsrMem(dst_md, dst_tensor); + dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream); // Execute concat concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims, fwd_cpu_stream); diff --git a/tensorflow/core/kernels/mkl_dequantize_op.cc b/tensorflow/core/kernels/mkl_dequantize_op.cc index 06570c1db1c..82d78250576 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op.cc @@ -75,6 +75,9 @@ class MklDequantizeOp : public OpKernel { MklDnnData src(&cpu_engine); MklDnnData dst(&cpu_engine); + std::shared_ptr reorder_stream; + reorder_stream.reset(CreateStream(ctx, cpu_engine)); + // If input is in MKL layout, then simply grab input layout; otherwise, // construct input TF layout. For TF layout, although input shape // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's @@ -85,6 +88,7 @@ class MklDequantizeOp : public OpKernel { : memory::desc(src_dims, MklDnnType(), MEMORY_FORMAT::nhwc); src.SetUsrMem(src_md, &src_tensor); + src.SetUsrMemDataHandle(&src_tensor, reorder_stream); Tensor* output_tensor = nullptr; MklDnnShape output_mkl_shape; @@ -129,6 +133,7 @@ class MklDequantizeOp : public OpKernel { AllocateOutputSetMklShape(ctx, 0, &output_tensor, output_tf_shape, output_mkl_shape); dst.SetUsrMem(dst_md, output_tensor); + dst.SetUsrMemDataHandle(output_tensor, reorder_stream); // The quantization logic here for mode SCALED is similar to the logic // in QuantizeAndDequantizeV2 and QuantizeAndDequantizeV3. 
@@ -155,8 +160,6 @@ class MklDequantizeOp : public OpKernel { // Also it does not define round_nearest (enum). attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest); #endif // !ENABLE_MKLDNN_V1 - std::shared_ptr reorder_stream; - reorder_stream.reset(CreateStream(ctx, cpu_engine)); std::vector net; // Create reorder primitive and then execute. diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index a11e7ebcbf5..3e512d0792b 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -137,6 +137,7 @@ class MklLRNOp : public OpKernel { // that input is in NHWC layout with Channel being the last dimension. src_dnn_data.SetUsrMem(src_md, &src_tensor); src_dnn_data.SetOpMemDesc(input_dims, MEMORY_FORMAT::nhwc); + src_dnn_data.SetUsrMemDataHandle(&src_tensor, fwd_stream_); // dst_dnn_data has the same shape as input. dst_dnn_data.SetUsrMem(src_md); @@ -157,7 +158,7 @@ class MklLRNOp : public OpKernel { &output_tensor); OP_REQUIRES_OK(context, context->status()); DCHECK(output_tensor != nullptr); - dst_dnn_data.SetUsrMemDataHandle(output_tensor); + dst_dnn_data.SetUsrMemDataHandle(output_tensor, fwd_stream_); // Handle workspace required for MKL-DNN. AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data); @@ -393,6 +394,7 @@ class MklLRNGradOp : public OpKernel { orig_input_dnn_shape.GetSizesAsMklDnnDims(); orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor); orig_input_dnn_data.SetOpMemDesc(orig_input_dims, MEMORY_FORMAT::nhwc); + orig_input_dnn_data.SetUsrMemDataHandle(&orig_input_tensor, bwd_stream_); // output_dnn_data has the same shape as original input output_dnn_data.SetUsrMem(orig_input_md); @@ -421,7 +423,7 @@ class MklLRNGradOp : public OpKernel { orig_input_format, &output_tensor); OP_REQUIRES_OK(context, context->status()); DCHECK(output_tensor != nullptr); - output_dnn_data.SetUsrMemDataHandle(output_tensor); + output_dnn_data.SetUsrMemDataHandle(output_tensor, bwd_stream_); // Create LRN primitive and add it to the net // At this point, workspace is enabled, so we don't need diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc index 77a68afa752..2e5c6d2719b 100644 --- a/tensorflow/core/kernels/mkl_transpose_op.cc +++ b/tensorflow/core/kernels/mkl_transpose_op.cc @@ -137,6 +137,7 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor, memory::dims out_strides = ReorderStrides(CalculateTFStrides(out_dims), perm); + std::shared_ptr transpose_stream; in.SetUsrMem(in_dims, in_strides, &in_tensor); // Output dimensions are same as input dimensions. We adjust the layout // using strides. 
@@ -144,16 +145,16 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor, std::vector net; #ifdef ENABLE_MKLDNN_V1 - std::shared_ptr transpose_stream; auto* prim = FindOrCreateReorder(in.GetUsrMem(), out.GetUsrMem()); transpose_stream.reset(CreateStream(context, prim->GetEngine())); + in.SetUsrMemDataHandle(&in_tensor, transpose_stream); + out.SetUsrMemDataHandle(out_tensor, transpose_stream); net.push_back(*(prim->GetPrimitive())); std::vector net_args; net_args.push_back({{MKLDNN_ARG_FROM, *in.GetUsrMem()}, {MKLDNN_ARG_TO, *out.GetUsrMem()}}); execute_primitives(net, transpose_stream, net_args); #else - std::shared_ptr transpose_stream; transpose_stream.reset(new CPU_STREAM(cpu_engine)); net.push_back(FindOrCreateReorder(in.GetUsrMem(), out.GetUsrMem())); transpose_stream->submit(net).wait(); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 7f6272b09c1..996984eebc0 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1524,17 +1524,27 @@ class MklDnnData { } /// Set function for data buffer of user memory primitive. - inline void SetUsrMemDataHandle(void* data_buffer) { + inline void SetUsrMemDataHandle(void* data_buffer, + std::shared_ptr t_stream = nullptr) { CHECK_NOTNULL(user_memory_); CHECK_NOTNULL(data_buffer); +#ifdef ENABLE_MKLDNN_THREADPOOL + user_memory_->set_data_handle(data_buffer, *t_stream); +#else user_memory_->set_data_handle(data_buffer); +#endif // ENABLE_MKLDNN_THREADPOOL } /// Set function for data buffer of user memory primitive. - inline void SetUsrMemDataHandle(const Tensor* tensor) { + inline void SetUsrMemDataHandle(const Tensor* tensor, + std::shared_ptr t_stream = nullptr) { CHECK_NOTNULL(user_memory_); CHECK_NOTNULL(tensor); +#ifdef ENABLE_MKLDNN_THREADPOOL + user_memory_->set_data_handle(GetTensorBuffer(tensor), *t_stream); +#else user_memory_->set_data_handle(GetTensorBuffer(tensor)); +#endif // ENABLE_MKLDNN_THREADPOOL } /// allocate function for data buffer From f3d3480c811480c2b966df1086828e47131eb783 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 17 Jun 2020 09:25:28 +0800 Subject: [PATCH 044/112] [tflite] make label_image build on linux and macOS label_image doesn't build on Linux and macOS platforms ``` bazel build --config opt //tensorflow/lite/examples/label_image:label_image ``` shows something like ``` ERROR: /home/freedom/work/tensorflow/tensorflow/lite/examples/label_image/BUILD:15:1: undeclared inclusion(s) in rule '//tensorflow/lite/examples/label_image:label_image': this rule is missing dependency declarations for the following files included by 'tensorflow/lite/examples/label_image/label_image.cc': 'external/com_google_absl/absl/strings/string_view.h' 'external/com_google_absl/absl/base/internal/throw_delegate.h' ``` Add `"@com_google_absl//absl/strings"` to deps --- tensorflow/lite/examples/label_image/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD index 01296b0b2a0..633f767c5e9 100644 --- a/tensorflow/lite/examples/label_image/BUILD +++ b/tensorflow/lite/examples/label_image/BUILD @@ -38,6 +38,7 @@ cc_binary( "//tensorflow/lite/profiling:profiler", "//tensorflow/lite/tools/evaluation:utils", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ] + select({ "//tensorflow:android": [ "//tensorflow/lite/delegates/gpu:delegate", From eaf10e9a279e270d8133a41cfc9690de10423a1d Mon Sep 17 00:00:00 2001 From: Lutz Roeder Date: Sun, 14 Jun 
2020 23:02:59 -0700 Subject: [PATCH 045/112] Fix Keras documentation --- tensorflow/python/keras/layers/core.py | 8 ++++++-- tensorflow/python/keras/layers/wrappers.py | 9 +++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index abfb025db30..e64a1c27bcf 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -825,10 +825,14 @@ class Lambda(Layer): returned as output mask regardless of what the input is. arguments: Optional dictionary of keyword arguments to be passed to the function. - Input shape: Arbitrary. Use the keyword argument input_shape (tuple of + + Input shape: + Arbitrary. Use the keyword argument input_shape (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. - Output shape: Specified by `output_shape` argument + + Output shape: + Specified by `output_shape` argument """ @trackable.no_automatic_dependency_tracking diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py index 8fe3b3b20bb..23fef467cfe 100644 --- a/tensorflow/python/keras/layers/wrappers.py +++ b/tensorflow/python/keras/layers/wrappers.py @@ -341,10 +341,11 @@ class Bidirectional(Wrapper): combined. One of {'sum', 'mul', 'concat', 'ave', None}. If None, the outputs will not be combined, they will be returned as a list. Default value is 'concat'. - backward_layer: Optional `keras.layers.RNN`, or keras.layers.Layer` instance - to be used to handle backwards input processing. If `backward_layer` is - not provided, the layer instance passed as the `layer` argument will be - used to generate the backward layer automatically. + backward_layer: Optional `keras.layers.RNN`, or `keras.layers.Layer` + instance to be used to handle backwards input processing. + If `backward_layer` is not provided, the layer instance passed as the + `layer` argument will be used to generate the backward layer + automatically. Note that the provided `backward_layer` layer should have properties matching those of the `layer` argument, in particular it should have the same values for `stateful`, `return_states`, `return_sequence`, etc. From 286cd7fc6839bb2fc999fd16fb1801f6b30656b8 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 17 Jun 2020 12:22:53 +0100 Subject: [PATCH 046/112] Addressed reviewer's comments. 
Change-Id: I4b849e60540879ca89483ede675c63631bc9417b --- tensorflow/lite/python/lite.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 06796ba820b..cb2f1853619 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -299,32 +299,21 @@ class QuantizationMode(object): inference_input_type = input_ty if input_ty else constants.FLOAT inference_output_type = output_ty if output_ty else constants.FLOAT - if self.post_training_int8_no_float(): + + if self.post_training_int8_no_float() \ + or self.post_training_int16x8_no_float(): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, - "activations_type": constants.INT8, + "activations_type": self.activations_type(), "allow_float": False } - elif self.post_training_int8_allow_float(): + elif self.post_training_int8_allow_float() \ + or self.post_training_int16x8_allow_float(): return True, { "inference_input_type": inference_input_type, "inference_output_type": inference_output_type, - "activations_type": constants.INT8, - "allow_float": True - } - elif self.post_training_int16x8_no_float(): - return True, { - "inference_input_type": inference_input_type, - "inference_output_type": inference_output_type, - "activations_type": constants.INT16, - "allow_float": False - } - elif self.post_training_int16x8_allow_float(): - return True, { - "inference_input_type": inference_input_type, - "inference_output_type": inference_output_type, - "activations_type": constants.INT16, + "activations_type": self.activations_type(), "allow_float": True } else: From 97afd248868c7f28c197abde87cf610b550bdef9 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Wed, 17 Jun 2020 10:00:23 -0700 Subject: [PATCH 047/112] Relu grad GPU uses 8 float16 element vector --- tensorflow/core/kernels/relu_op_gpu.cu.cc | 89 +++++++++++++++++++++-- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 27fd5f64249..983cc127863 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -35,6 +35,7 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; +static constexpr int VectorSizeElements = 8; namespace functor { // This kernel computes ReluGrad by processing one half2, two fp16, at a time. @@ -93,6 +94,66 @@ __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient, } } +__global__ void ReluGradHalfKernelVector( + const Eigen::half* __restrict__ gradient, + const Eigen::half* __restrict__ feature, + Eigen::half* __restrict__ backprop, int32 count) { + int32 half8_count = count / VectorSizeElements; + int32 index = blockIdx.x * blockDim.x + threadIdx.x; + + if (index < half8_count) { + // Cast to xx_h8 for vector load and store. + float4 gradient_h8 = reinterpret_cast(gradient)[index]; + float4 feature_h8 = reinterpret_cast(feature)[index]; + float4* p_backprop_h8 = reinterpret_cast(backprop) + index; + + half2 *gradient_h2 = reinterpret_cast(&gradient_h8); + half2 *feature_h2 = reinterpret_cast(&feature_h8); + float4 backprop_h8; + half2* p_backprop_h2 = reinterpret_cast(&backprop_h8); + + // Fast path, when half2 primitives are available. 
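+    // __hgt2/__hmul2 are only available on compute capability 5.3 and newer,
+    // so each half2 pair below either uses native half math or is widened to
+    // float2, masked with (feature > 0), and narrowed back to half2.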
+#if __CUDA_ARCH__ >= 530 + const half2 kZeroH2 = __float2half2_rn(0.f); +#endif + for (int i = 0; i < VectorSizeElements / 2; i++) { +#if __CUDA_ARCH__ >= 530 + // mask = (feature > 0) + half2 mask_h2 = __hgt2(feature_h2[i], kZeroH2); + // backprop = mask * gradient + half2 backprop_h2 = __hmul2(mask_h2, gradient_h2[i]); +#else + // Fall back: convert half2 to float2 for processing. + float2 feature_f2 = __half22float2(feature_h2[i]); + float2 gradient_f2 = __half22float2(gradient_h2[i]); + float2 backprop_f2 = + make_float2((feature_f2.x > 0.0f) ? float(gradient_f2.x) : 0.0f, + (feature_f2.y > 0.0f) ? float(gradient_f2.y) : 0.0f); + // Convert back to half2. + half2 backprop_h2 = __float22half2_rn(backprop_f2); +#endif + p_backprop_h2[i] = backprop_h2; + } + // Write back the result. + *p_backprop_h8 = backprop_h8; + } + + int remaining_count = (count % VectorSizeElements); + + if (index < remaining_count) { + // Use first threads to process the remaining elements. + Eigen::half grad_h = gradient[half8_count * VectorSizeElements + index]; + Eigen::half feature_h = feature[half8_count * VectorSizeElements + index]; + + float grad_f = static_cast(grad_h); + float feature_f = static_cast(feature_h); + float backprop_f = (feature_f > 0) ? grad_f : 0; + + Eigen::half backprop_h(backprop_f); + backprop[half8_count * VectorSizeElements + index] = backprop_h; + } +} + template struct ReluGrad { // Computes ReluGrad backprop. @@ -108,15 +169,29 @@ struct ReluGrad { // NOTE: When the activation is exactly zero, we do not propagate the // associated gradient value. This allows the output of the Relu to be used, // as well as its input. + auto gradient_ptr = reinterpret_cast(gradient.data()); + auto feature_ptr = reinterpret_cast(feature.data()); + auto backprop_ptr = reinterpret_cast(backprop.data()); + bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 && + backprop_ptr % 16 == 0; int32 count = gradient.size(); - if (count == 0) return; - int32 half2_count = Eigen::divup(count, 2); constexpr int32 kThreadInBlock = 512; - GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( - half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); - TF_CHECK_OK(GpuLaunchKernel( - ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, - d.stream(), gradient.data(), feature.data(), backprop.data(), count)); + if (count == 0) return; + if (aligned) { + int32 half8_count = Eigen::divup(count, VectorSizeElements); + int32 kBlock = Eigen::divup(half8_count, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernelVector, kBlock, kThreadInBlock, + 0, d.stream(), gradient.data(), feature.data(), backprop.data(), + count)); + } else { + int32 half2_count = Eigen::divup(count, 2); + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( + half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); + TF_CHECK_OK(GpuLaunchKernel( + ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, + d.stream(), gradient.data(), feature.data(), backprop.data(), count)); + } } }; From c65fccf0a6671c90599d0d3426dd18597688ea3a Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 17 Jun 2020 11:55:27 -0700 Subject: [PATCH 048/112] Format changes as per Google's feedback --- .../ci_build/linux/mkl/Dockerfile.devel-mkl | 4 ++-- .../ci_build/linux/mkl/build-dev-container.sh | 21 +++++++++---------- .../linux/mkl/install_openmpi_horovod.sh | 7 ++++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl 
b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index a78d13c7755..1f80cba35f0 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -64,9 +64,9 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ # Install OpenMPI/Horovod COPY install_openmpi_horovod.sh . -RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ +RUN if [ "${ENABLE_HOROVOD}" == "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ - ./install_openmpi_horovod.sh OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} && \ + OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} ./install_openmpi_horovod.sh && \ rm -rf install_openmpi_horovod.sh; \ fi diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh index 6e789a54e87..a0880b0e51c 100755 --- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh +++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh @@ -201,21 +201,20 @@ function test_container() debug "ID of the running docker container: ${CONTAINER_ID}" debug "Performing basic sanity checks on the running container..." - TEST_CMD_1=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import _pywrap_util_port; print(_pywrap_util_port.IsMklEnabled())'") - # Make TEST_CMD backward compatible with older code - TEST_CMD_2=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import pywrap_tensorflow; print(pywrap_tensorflow.IsMklEnabled())'") - - if [ "${TEST_CMD_1}" = "True" -o "${TEST_CMD_2}" = "True" ] ; then - echo "PASS: MKL enabled test in ${TEMP_IMAGE_NAME}" - else - die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}" - fi + { + ${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import _pywrap_util_port; print(_pywrap_util_port.IsMklEnabled())'" + echo "PASS: MKL enabled test in ${TEMP_IMAGE_NAME}" + } || { + ${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'from tensorflow.python import pywrap_tensorflow; print(pywrap_tensorflow.IsMklEnabled())'" + echo "PASS: Old MKL enabled in ${TEMP_IMAGE_NAME}" + } || { + die "FAIL: MKL enabled test in ${TEMP_IMAGE_NAME}" + } # Test to check if horovod is installed successfully if [[ ${ENABLE_HOROVOD} == "yes" ]]; then debug "Test horovod in the container..." - HOROVOD_TEST_CMD=$(${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'") - ${HOROVOD_TEST_CMD} + ${DOCKER_BINARY} exec ${CONTAINER_ID} bash -c "${PYTHON} -c 'import horovod.tensorflow as hvd;'" if [[ $? 
== "0" ]]; then echo "PASS: HOROVOD installation test in ${TEMP_IMAGE_NAME}" else diff --git a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh index aec40543a17..9bc92ca4fef 100755 --- a/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh +++ b/tensorflow/tools/ci_build/linux/mkl/install_openmpi_horovod.sh @@ -54,8 +54,9 @@ echo 'OpenMPI version:' mpirun --version # Install OpenSSH for MPI to communicate between containers -apt-get clean && apt-get update && apt-get install -y --no-install-recommends --fix-missing \ - openssh-client openssh-server libnuma-dev && \ +apt-get clean && apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + openssh-client openssh-server libnuma-dev && \ rm -rf /var/lib/apt/lists/* if [[ $? == "0" ]]; then echo "PASS: OpenSSH installation" @@ -70,7 +71,7 @@ else fi mkdir -p /var/run/sshd # Allow OpenSSH to talk to containers without asking for confirmation -cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new +grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config From 55929ce6ae2bc9552526a038ef4d01d8bef4f4fd Mon Sep 17 00:00:00 2001 From: shwetaoj Date: Wed, 17 Jun 2020 12:52:57 -0700 Subject: [PATCH 049/112] Reverting == in Dockerfile --- tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl index 1f80cba35f0..80091e55a17 100755 --- a/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl +++ b/tensorflow/tools/ci_build/linux/mkl/Dockerfile.devel-mkl @@ -64,7 +64,7 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \ # Install OpenMPI/Horovod COPY install_openmpi_horovod.sh . 
-RUN if [ "${ENABLE_HOROVOD}" == "yes" ]; then \ +RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \ chmod +x install_openmpi_horovod.sh && \ OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} HOROVOD_VERSION=${HOROVOD_VERSION} ./install_openmpi_horovod.sh && \ rm -rf install_openmpi_horovod.sh; \ From eda7d05793ec75227069eb0c3f49e0377f33c963 Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Thu, 18 Jun 2020 23:02:21 +0700 Subject: [PATCH 050/112] Add NewWritableFile --- .../experimental/filesystem/plugins/gcs/BUILD | 1 + .../filesystem/plugins/gcs/gcs_filesystem.cc | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index c9fee433589..05fd371088c 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -24,6 +24,7 @@ cc_library( "//tensorflow:windows": get_win_copts(), }), deps = [ + "//tensorflow/c:env", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 8c54bc85439..4ddc8548486 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -15,8 +15,11 @@ limitations under the License. #include #include +#include + #include "absl/strings/string_view.h" #include "google/cloud/storage/client.h" +#include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/tf_status.h" @@ -75,6 +78,25 @@ static void ParseGCSPath(absl::string_view fname, bool object_empty_ok, strcpy(*object, object_view.data()); } +class TempFile : public std::fstream { + public: + // We should specify openmode each time we call TempFile. + TempFile(const char* temp_file_name, std::ios::openmode mode) + : std::fstream(temp_file_name, mode), name(temp_file_name) {} + TempFile(TempFile&& rhs) : std::fstream(std::move(rhs)), name(rhs.name) { + rhs.name = nullptr; + } + ~TempFile() { + std::fstream::close(); + std::remove(name); + plugin_memory_free(const_cast(name)); + } + const char* getName() { return name; } + + private: + const char* name; +}; + // SECTION 1. Implementation for `TF_RandomAccessFile` // ---------------------------------------------------------------------------- namespace tf_random_access_file { @@ -86,6 +108,20 @@ namespace tf_random_access_file { // SECTION 2. 
Implementation for `TF_WritableFile` // ---------------------------------------------------------------------------- namespace tf_writable_file { +typedef struct GCSFile { + const char* bucket; + const char* object; + gcs::Client* gcs_client; // not owned + TempFile outfile; + bool sync_need; +} GCSFile; + +static void Cleanup(TF_WritableFile* file) { + auto gcs_file = static_cast(file->plugin_file); + plugin_memory_free(const_cast(gcs_file->bucket)); + plugin_memory_free(const_cast(gcs_file->object)); + delete gcs_file; +} // TODO(vnvo2409): Implement later @@ -119,6 +155,20 @@ static void Init(TF_Filesystem* filesystem, TF_Status* status) { // TODO(vnvo2409): Implement later +static void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + char* bucket; + char* object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return; + + auto gcs_client = static_cast(filesystem->plugin_filesystem); + TempFile outfile(TF_GetTempFileName(""), std::ios::binary | std::ios::out); + file->plugin_file = new tf_writable_file::GCSFile( + {bucket, object, gcs_client, std::move(outfile), true}); + TF_SetStatus(status, TF_OK, ""); +} + } // namespace tf_gcs_filesystem static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, From 88c63dccd1dacc7b7b5658bc200e872be4c32f5f Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Thu, 18 Jun 2020 23:46:41 +0700 Subject: [PATCH 051/112] Move TempFile to gcs_helper --- .../experimental/filesystem/plugins/gcs/BUILD | 12 ++++++- .../filesystem/plugins/gcs/gcs_filesystem.cc | 20 +---------- .../filesystem/plugins/gcs/gcs_helper.cc | 19 +++++++++++ .../filesystem/plugins/gcs/gcs_helper.h | 33 +++++++++++++++++++ 4 files changed, 64 insertions(+), 20 deletions(-) create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index 05fd371088c..d104181b264 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -24,10 +24,20 @@ cc_library( "//tensorflow:windows": get_win_copts(), }), deps = [ - "//tensorflow/c:env", + ":gcs_helper", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", "@com_google_absl//absl/strings", ], ) + +cc_library( + name = "gcs_helper", + srcs = ["gcs_helper.cc"], + hdrs = ["gcs_helper.h"], + linkstatic = 1, + deps = [ + "//tensorflow/c:env", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index 4ddc8548486..2793194e0a8 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -21,6 +21,7 @@ limitations under the License. #include "google/cloud/storage/client.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for GCS environments. 
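Before the hunk that deletes the inline definition, it may help to note what `TempFile` actually is: an RAII wrapper that behaves like a `std::fstream` while alive and unlinks its backing file on destruction, which is how `NewWritableFile` appears to stage bytes locally before they are synced to GCS. Below is a standalone approximation using only the standard library; the name handling is simplified (a `std::string` instead of a plugin-allocated `char*`), so treat it as an illustration of the ownership pattern rather than the plugin's exact class.
```
#include <cstdio>
#include <fstream>
#include <string>
#include <utility>

// Standalone sketch of a TempFile-style RAII wrapper: usable as a stream,
// and the backing file is removed when the wrapper is destroyed.
class ScratchFile : public std::fstream {
 public:
  ScratchFile(std::string path, std::ios::openmode mode)
      : std::fstream(path, mode), path_(std::move(path)) {}
  ScratchFile(ScratchFile&& rhs)
      : std::fstream(std::move(rhs)), path_(std::move(rhs.path_)) {
    rhs.path_.clear();  // the moved-from object no longer owns the file
  }
  ~ScratchFile() {
    std::fstream::close();
    if (!path_.empty()) std::remove(path_.c_str());
  }
  const std::string& path() const { return path_; }

 private:
  std::string path_;
};

int main() {
  ScratchFile f("scratch.tmp", std::ios::out | std::ios::binary);
  f << "staged bytes";
}  // closed and unlinked here
```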
@@ -78,25 +79,6 @@ static void ParseGCSPath(absl::string_view fname, bool object_empty_ok, strcpy(*object, object_view.data()); } -class TempFile : public std::fstream { - public: - // We should specify openmode each time we call TempFile. - TempFile(const char* temp_file_name, std::ios::openmode mode) - : std::fstream(temp_file_name, mode), name(temp_file_name) {} - TempFile(TempFile&& rhs) : std::fstream(std::move(rhs)), name(rhs.name) { - rhs.name = nullptr; - } - ~TempFile() { - std::fstream::close(); - std::remove(name); - plugin_memory_free(const_cast(name)); - } - const char* getName() { return name; } - - private: - const char* name; -}; - // SECTION 1. Implementation for `TF_RandomAccessFile` // ---------------------------------------------------------------------------- namespace tf_random_access_file { diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc new file mode 100644 index 00000000000..139579c53ae --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.cc @@ -0,0 +1,19 @@ +#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" + +#include + +#include +#include + +TempFile::TempFile(const char* temp_file_name, std::ios::openmode mode) + : std::fstream(temp_file_name, mode), name(temp_file_name) {} + +TempFile::TempFile(TempFile&& rhs) + : std::fstream(std::move(rhs)), name(std::move(rhs.name)) {} + +TempFile::~TempFile() { + std::fstream::close(); + std::remove(name.c_str()); +} + +const std::string TempFile::getName() const { return name; } \ No newline at end of file diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h new file mode 100644 index 00000000000..437cbe560d6 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ +#define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ + +#include +#include + +class TempFile : public std::fstream { + public: + // We should specify openmode each time we call TempFile. 
+ TempFile(const char* temp_file_name, std::ios::openmode mode); + TempFile(TempFile&& rhs); + ~TempFile(); + const std::string getName() const; + + private: + const std::string name; +}; + +#endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_HELPER_H_ From ededf6f4b9d1488c1d27df58b047fd5da6ad6c73 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi Date: Thu, 18 Jun 2020 13:14:37 -0700 Subject: [PATCH 052/112] Update tensorflow/core/util/mkl_util.h Addressing review comments Co-authored-by: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> --- tensorflow/core/util/mkl_util.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 996984eebc0..854d6e349cb 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1538,13 +1538,8 @@ class MklDnnData { /// Set function for data buffer of user memory primitive. inline void SetUsrMemDataHandle(const Tensor* tensor, std::shared_ptr t_stream = nullptr) { - CHECK_NOTNULL(user_memory_); - CHECK_NOTNULL(tensor); -#ifdef ENABLE_MKLDNN_THREADPOOL - user_memory_->set_data_handle(GetTensorBuffer(tensor), *t_stream); -#else - user_memory_->set_data_handle(GetTensorBuffer(tensor)); -#endif // ENABLE_MKLDNN_THREADPOOL + SetUsrMemDataHandle(GetTensorBuffer(tensor), t_stream); + } } /// allocate function for data buffer From f8431d0c293a34c5bfb91cf0c57384eaa47a9911 Mon Sep 17 00:00:00 2001 From: Sharada Shiddibhavi Date: Thu, 18 Jun 2020 13:29:48 -0700 Subject: [PATCH 053/112] Update mkl_util.h --- tensorflow/core/util/mkl_util.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 854d6e349cb..eb1a105e07c 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1540,7 +1540,6 @@ class MklDnnData { std::shared_ptr t_stream = nullptr) { SetUsrMemDataHandle(GetTensorBuffer(tensor), t_stream); } - } /// allocate function for data buffer inline void AllocateBuffer(size_t size) { From 6558da5a66ad6863e47abfe596eee2290524b1b7 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Thu, 18 Jun 2020 13:30:08 -0700 Subject: [PATCH 054/112] Apply new TraceMe APIs. PiperOrigin-RevId: 317169381 Change-Id: I2259895a8dde21e25661a239b9d4f5911a454adb --- tensorflow/compiler/xla/pjrt/BUILD | 2 ++ tensorflow/compiler/xla/pjrt/pjrt_client.cc | 16 ++++++++-------- tensorflow/core/profiler/lib/connected_traceme.h | 1 + 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD index dd50d0577d4..e401a798d68 100644 --- a/tensorflow/compiler/xla/pjrt/BUILD +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -141,7 +141,9 @@ cc_library( "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", "//tensorflow/core:allocator", "//tensorflow/core:lib", + "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:stream", "//tensorflow/stream_executor/host:host_platform_id", diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index ccb72b7ce30..ef259cf1cfd 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -98,7 +98,9 @@ limitations under the License. 
#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/connected_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/stream_executor/device_memory.h" #include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/event.h" @@ -1429,10 +1431,9 @@ StatusOr PjRtExecutable::EnqueueExecution( int executable_idx, const RunId& run_id, const ExecuteOptions& options, Device* device, std::vector* device_buffers) const { int device_ordinal = device->local_device_state()->device_ordinal(); - tensorflow::profiler::TraceMe traceme([&] { - return absl::StrCat("LocalExecutable::Execute#run_id=", run_id.ToInt(), - "#"); - }); + tensorflow::profiler::TraceMeConsumer activity( + "LocalExecutable::Execute", tensorflow::profiler::ContextType::kPjRt, + run_id.ToInt()); VLOG(3) << "Replica " << replica << ", partition " << partition << " mapped to device ordinal for execution: " << device_ordinal; @@ -1721,10 +1722,9 @@ PjRtExecutable::ExecuteOnLocalDevices( absl::Span> argument_handles, const ExecuteOptions& options) const { RunId run_id; - tensorflow::profiler::TraceMe traceme([&] { - return absl::StrCat( - "LocalExecutable::ExecuteOnLocalDevices#run_id=", run_id.ToInt(), "#"); - }); + tensorflow::profiler::TraceMeProducer activity( + "LocalExecutable::ExecuteOnLocalDevices", + tensorflow::profiler::ContextType::kPjRt, run_id.ToInt()); const int num_local_devices = local_devices_.size(); diff --git a/tensorflow/core/profiler/lib/connected_traceme.h b/tensorflow/core/profiler/lib/connected_traceme.h index ed8b4ac1ad2..b55c4407fe6 100644 --- a/tensorflow/core/profiler/lib/connected_traceme.h +++ b/tensorflow/core/profiler/lib/connected_traceme.h @@ -29,6 +29,7 @@ enum class ContextType : int { kGeneric, kTfExecutor, kSharedBatchScheduler, + kPjRt, }; /* From e0962f4c374f4cbf78ad27fd2391f976c1a2050d Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Thu, 18 Jun 2020 13:35:31 -0700 Subject: [PATCH 055/112] Remove the old grouping rule for PJRT. 
PiperOrigin-RevId: 317170383 Change-Id: I96973a2d2cd0ca1fc786bc7500deb6b4fedd0534 --- tensorflow/core/profiler/utils/group_events.cc | 5 +---- tensorflow/core/profiler/utils/xplane_schema.cc | 4 ---- tensorflow/core/profiler/utils/xplane_schema.h | 3 --- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index be8dd506b0c..0772cff7b97 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -635,10 +635,7 @@ std::vector CreateInterThreadConnectInfoList() { {StatType::kStepId, StatType::kIterNum}}, {HostEventType::kKernelLaunch, HostEventType::kKernelExecute, - {StatType::kCorrelationId}}, - {HostEventType::kLocalExecutableExecuteOnLocalDevice, - HostEventType::kLocalExecutableExecute, - {StatType::kRunId}}}; + {StatType::kCorrelationId}}}; return connect_info_list; } diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index be53dcbdc01..5ca8326d72c 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -91,10 +91,6 @@ const HostEventTypeMap& GetHostEventTypeMap() { {"WhileOp-StartBody", kWhileOpStartBody}, {"ForOp", kForOp}, {"PartitionedCallOp", kPartitionedCallOp}, - // XLA related. - {"LocalExecutable::ExecuteOnLocalDevices", - kLocalExecutableExecuteOnLocalDevice}, - {"LocalExecutable::Execute", kLocalExecutableExecute}, // tf.data related. {"IteratorGetNextOp::DoCompute", kIteratorGetNextOp}, {"IteratorGetNextAsOptionalOp::DoCompute", kIteratorGetNextAsOptionalOp}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index a31814cef06..41774deaa59 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -81,9 +81,6 @@ enum HostEventType { kWhileOpStartBody, kForOp, kPartitionedCallOp, - // XLA related. - kLocalExecutableExecuteOnLocalDevice, - kLocalExecutableExecute, // tf.data related. kIteratorGetNextOp, kIteratorGetNextAsOptionalOp, From 8f700fb2e0da382f1e2e9630f56a7922a8799a59 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Thu, 18 Jun 2020 13:39:10 -0700 Subject: [PATCH 056/112] [XLA] Propagate memory spaces recursively inside nested fusions. PiperOrigin-RevId: 317171110 Change-Id: I65004edb7498acb2f3b4238d9afbbb5d3930aab5 --- .../xla/service/memory_space_propagation.cc | 80 +++++++--- .../xla/service/memory_space_propagation.h | 11 +- .../service/memory_space_propagation_test.cc | 148 ++++++++++++++++++ 3 files changed, 214 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.cc b/tensorflow/compiler/xla/service/memory_space_propagation.cc index 80eb4017477..2eb15b14eaf 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation.cc +++ b/tensorflow/compiler/xla/service/memory_space_propagation.cc @@ -29,36 +29,78 @@ StatusOr MemorySpacePropagation::Run(HloModule* module) { // Propagate the operand subshapes. 
for (int operand_idx = 0; operand_idx < instruction->operand_count(); ++operand_idx) { - modified |= - PropagateSubshapes(instruction->operand(operand_idx)->shape(), - instruction->fused_parameter(operand_idx)); + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes( + instruction->operand(operand_idx)->shape())) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + modified |= Propagate(indexed_shape.index, + instruction->fused_parameter(operand_idx), + memory_space); + } } // Propagate output subshapes. - modified |= PropagateSubshapes(instruction->shape(), - instruction->fused_expression_root()); + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(instruction->shape())) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + modified |= + Propagate(indexed_shape.index, + instruction->fused_expression_root(), memory_space); + } } } } return modified; } -bool MemorySpacePropagation::PropagateSubshapes( - const Shape& caller_shape, const HloInstruction* callee_instruction) const { +bool MemorySpacePropagation::Propagate(ShapeIndexView index, + const HloInstruction* callee_instruction, + int64 memory_space) const { bool modified = false; - for (const ShapeUtil::IndexedShape& indexed_shape : - ShapeUtil::GetLeafShapes(caller_shape)) { - int64 memory_space = indexed_shape.shape.layout().memory_space(); - const HloValue& value = dataflow_analysis_->GetUniqueValueAt( - callee_instruction, indexed_shape.index); + const HloValue& value = dataflow_analysis_->GetUniqueValueAt( + callee_instruction, index.ToShapeIndex()); - for (const HloPosition& position : value.positions()) { - Shape* shape = ShapeUtil::GetMutableSubshape( - position.instruction->mutable_shape(), position.index); - if (shape->layout().memory_space() != memory_space) { - shape->mutable_layout()->set_memory_space(memory_space); - modified = true; - } + for (const HloPosition& position : value.positions()) { + HloInstruction* instruction = position.instruction; + Shape* shape = ShapeUtil::GetMutableSubshape(instruction->mutable_shape(), + position.index); + if (shape->layout().memory_space() == memory_space) { + continue; + } + shape->mutable_layout()->set_memory_space(memory_space); + modified = true; + + // For fusion outputs, propagate the memory space to the fusion root. + if (instruction->opcode() == HloOpcode::kFusion) { + Propagate(position.index, instruction->fused_expression_root(), + memory_space); + } + + const HloInstruction* parent_fusion = + instruction->parent()->FusionInstruction(); + // For nested fusion roots, pop one level up and propagate the memory space + // to the output of the calling fusion instruction. + if (instruction == instruction->parent()->root_instruction() && + parent_fusion->parent()->IsFusionComputation()) { + Propagate(position.index, parent_fusion, memory_space); + } + + // For nested fusion parameters, pop one level up and propagate the memory + // space to the operand of the calling fusion instruction. + if (instruction->opcode() == HloOpcode::kParameter && + parent_fusion->parent()->IsFusionComputation()) { + const HloInstruction* fusion_operand = + parent_fusion->operand(instruction->parameter_number()); + Propagate(position.index, fusion_operand, memory_space); + } + } + + for (const HloUse& use : value.uses()) { + // For fusion uses, propagate the memory space to the fusion parameter. 
+ if (use.instruction->opcode() == HloOpcode::kFusion) { + modified |= Propagate( + use.operand_index, + use.instruction->fused_parameter(use.operand_number), memory_space); } } return modified; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.h b/tensorflow/compiler/xla/service/memory_space_propagation.h index 65a1dfd14a6..510e9e69f79 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation.h +++ b/tensorflow/compiler/xla/service/memory_space_propagation.h @@ -31,12 +31,11 @@ class MemorySpacePropagation : public HloModulePass { StatusOr Run(HloModule* module) override; private: - // Given the caller shape (operand or output) and its corresponding - // insturction in the fused computation (parameter or root), propagates the - // memory space to all the subshapes in the callee side. Returns true if the - // module is modified. - bool PropagateSubshapes(const Shape& caller_shape, - const HloInstruction* callee_instruction) const; + // Given the shape index (operand or output) and its corresponding instruction + // in the fused computation (parameter or root), propagates the memory space + // in the callee side. Returns true if the module is modified. + bool Propagate(ShapeIndexView index, const HloInstruction* callee_instruction, + int64 memory_space) const; std::unique_ptr dataflow_analysis_; }; diff --git a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc index 8d74958f6aa..de45af5a190 100644 --- a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc @@ -199,5 +199,153 @@ TEST_F(MemorySpacePropagationTest, TupleOutput) { EXPECT_EQ(module->Hash(), ref->Hash()); } +TEST_F(MemorySpacePropagationTest, NestedInputFusion) { + // Tests propagating the memory space to nested fusions on the input side. 
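+  // The outer fusion's operand 0 is placed in memory space S(1); the expected
+  // module below checks that this space also reaches the nested
+  // bitcast_fusion's parameter and root.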
+ absl::string_view hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[3,2]{0,1:T(128)} parameter(0) + ROOT %bitcast = s32[6]{0:T(128)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[3,2]{0,1:T(128)} parameter(0) + %fusion.1 = s32[6]{0:T(128)} fusion(%param_0.1), kind=kLoop, calls=bitcast_fusion + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %fusion.1) + } + + ENTRY %entry { + %param0 = s32[3,2]{0,1:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[3,2]{0,1:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[3,2]{0,1:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[3,2]{0,1:T(128)S(1)} parameter(0) + ROOT %bitcast = s32[6]{0:T(128)S(1)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[3,2]{0,1:T(128)S(1)} parameter(0) + %fusion.1 = s32[6]{0:T(128)S(1)} fusion(%param_0.1), kind=kLoop, calls=bitcast_fusion + ROOT %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)S(1)} %fusion.1) + } + + ENTRY %entry { + %param0 = s32[3,2]{0,1:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[3,2]{0,1:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[3,2]{0,1:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, NestedOutputFusion) { + // Tests propagating the memory space to nested fusions on the output side. 
+ absl::string_view hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[6]{0:T(128)} parameter(0) + ROOT %bitcast = s32[3,2]{0,1:T(128)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %fusion.1 = s32[3,2]{0,1:T(128)} fusion(%add.0), kind=kLoop, calls=bitcast_fusion + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[3,2]{0,1:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[3,2]{0,1:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NestedFusion + + %bitcast_fusion { + %bf_param = s32[6]{0:T(128)S(1)} parameter(0) + ROOT %bitcast = s32[3,2]{0,1:T(128)S(1)} bitcast(%bf_param) + } + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)S(1)} %param_0.1) + ROOT %fusion.1 = s32[3,2]{0,1:T(128)S(1)} fusion(%add.0), kind=kLoop, calls=bitcast_fusion + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[3,2]{0,1:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[3,2]{0,1:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + } // namespace } // namespace xla From fc5151130813140eb4189f77dd3a759c4077836d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 13:41:07 -0700 Subject: [PATCH 057/112] Delete orphaned comment The comment for `loader_spec_` stuck around even after it was removed. 
PiperOrigin-RevId: 317171521 Change-Id: Iddb6029fdad9cd5ef33bc4f4ea2653caed305658 --- tensorflow/compiler/xla/service/gpu/kernel_thunk.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 88351881f3a..25acabb239b 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -77,8 +77,6 @@ class KernelThunk : public Thunk { // Will be set by IrEmitterUnnested. LaunchDimensions launch_dimensions_; - // Describes how to load this kernel. ExecuteOnStream reuses this loader - // specification for all executions. mutable tensorflow::mutex mutex_; // Loaded kernels for each `StreamExecutor`. Requires pointer stability of From c41f4652b45bf70f20686e612b41574b4b8139d7 Mon Sep 17 00:00:00 2001 From: Marissa Ikonomidis Date: Thu, 18 Jun 2020 13:52:21 -0700 Subject: [PATCH 058/112] Add an option to enable MLIR bridge for tpu_py_test rule If enable_mlir_bridge is True, a new test will be generated that runs with the MLIR bridge enabled. This option is off by default. PiperOrigin-RevId: 317173675 Change-Id: I332e1ae24cf82fceea20fd0aff2cec7c9b236a24 --- tensorflow/core/platform/default/distribute.bzl | 3 +++ tensorflow/python/framework/test_util.py | 3 +++ tensorflow/python/tpu/tpu.bzl | 2 ++ 3 files changed, 8 insertions(+) diff --git a/tensorflow/core/platform/default/distribute.bzl b/tensorflow/core/platform/default/distribute.bzl index 46a5d826a79..b16d5e8cff7 100644 --- a/tensorflow/core/platform/default/distribute.bzl +++ b/tensorflow/core/platform/default/distribute.bzl @@ -22,6 +22,7 @@ def distribute_py_test( full_precision = False, disable_v2 = False, disable_v3 = False, + disable_mlir_bridge = True, **kwargs): """Generates py_test targets for CPU and GPU. @@ -40,6 +41,7 @@ def distribute_py_test( full_precision: unused. disable_v2: whether tests for TPU version 2 should be generated. disable_v3: whether tests for TPU version 3 should be generated. + disable_mlir_bridge: whether to also run this with the mlir bridge enabled. **kwargs: extra keyword arguments to the non-tpu test. """ @@ -77,6 +79,7 @@ def distribute_py_test( tags = tpu_tags, disable_v2 = disable_v2, disable_v3 = disable_v3, + disable_mlir_bridge = disable_mlir_bridge, ) register_extension_info( diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index a46bb7c9bda..8ddbcf34f3b 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -1933,6 +1933,9 @@ class TensorFlowTestCase(googletest.TestCase): # disable it here. pywrap_tf_session.TF_SetXlaConstantFoldingDisabled(True) + if is_mlir_bridge_enabled(): + context.context().enable_mlir_bridge = True + self._threads = [] self._tempdir = None self._cached_session = None diff --git a/tensorflow/python/tpu/tpu.bzl b/tensorflow/python/tpu/tpu.bzl index 5453702d64d..3c26d9b49bf 100644 --- a/tensorflow/python/tpu/tpu.bzl +++ b/tensorflow/python/tpu/tpu.bzl @@ -25,6 +25,7 @@ def tpu_py_test( disable_v2 = False, disable_v3 = False, disable_experimental = False, + disable_mlir_bridge = True, args = [], **kwargs): """Generates identical unit test variants for various Cloud TPU versions. @@ -37,6 +38,7 @@ def tpu_py_test( disable_v2: If true, don't generate TPU v2 tests. disable_v3: If true, don't generate TPU v3 tests. disable_experimental: Unused. + disable_mlir_bridge: Unused. args: Arguments to apply to tests. 
**kwargs: Additional named arguments to apply to tests. """ From ef52b4e0886b7212471462643e92e98bea0253be Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 18 Jun 2020 13:53:10 -0700 Subject: [PATCH 059/112] [XLA/Client] Implement LocalClient::Run which supports buffer donation PiperOrigin-RevId: 317173848 Change-Id: If92955ac5051376fbf0932b773b675459497c0c4 --- .../compiler/xla/client/local_client.cc | 47 +++++++++++++++---- tensorflow/compiler/xla/client/local_client.h | 9 ++++ .../tests/multiple_devices_on_host_test.cc | 3 +- tensorflow/compiler/xla/tests/while_test.cc | 6 ++- 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index afe115deda8..f71e8a2d56d 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -168,6 +168,26 @@ LocalExecutable::RunHelper(const absl::Span argument_shapes, return std::make_pair(service_options, std::move(stream)); } +StatusOr LocalExecutable::GetExecutableRunOptions( + absl::Span argument_shapes, + const ExecutableRunOptions& run_options) { + TF_ASSIGN_OR_RETURN(auto options_and_stream, + RunHelper(argument_shapes, run_options)); + ExecutableRunOptions options = options_and_stream.first.run_options(); + options.set_device_ordinal(-1); + return options; +} + +template +static StatusOr BlockHostUntilDoneAfterAsyncCall( + se::Stream* stream, std::function()> async_callback) { + StatusOr result = async_callback(); + Status block_status = stream->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; +} + StatusOr LocalExecutable::Run( const absl::Span arguments, ExecutableRunOptions run_options) { @@ -176,15 +196,24 @@ StatusOr LocalExecutable::Run( for (const ShapedBuffer* const arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - TF_ASSIGN_OR_RETURN(auto options_and_stream, - RunHelper(argument_shapes, run_options)); - ExecutableRunOptions options = options_and_stream.first.run_options(); - options.set_device_ordinal(-1); - auto result = RunAsync(arguments, options); - Status block_status = options.stream()->BlockHostUntilDone(); - TF_RETURN_IF_ERROR(result.status()); - TF_RETURN_IF_ERROR(block_status); - return result; + TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, + GetExecutableRunOptions(argument_shapes, run_options)); + return BlockHostUntilDoneAfterAsyncCall( + options.stream(), [&] { return RunAsync(arguments, options); }); +} + +StatusOr LocalExecutable::Run( + std::vector arguments, ExecutableRunOptions run_options) { + std::vector argument_shapes; + argument_shapes.reserve(arguments.size()); + for (const ExecutionInput& arg : arguments) { + argument_shapes.push_back(&arg.shape()); + } + TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, + GetExecutableRunOptions(argument_shapes, run_options)); + return BlockHostUntilDoneAfterAsyncCall( + options.stream(), + [&] { return RunAsync(argument_shapes, std::move(arguments), options); }); } static std::shared_ptr DumpArguments( diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 7cdeb9dcbf6..b00f5cc6801 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -51,6 +51,11 @@ class LocalExecutable { const absl::Span arguments, ExecutableRunOptions run_options); + // Similar to Run(), but allows for donating argument buffers to the + 
// executable. + StatusOr Run(std::vector arguments, + ExecutableRunOptions run_options); + // Similar to Run(), but need not block the host waiting for the computation // to complete before returning. StatusOr RunAsync( @@ -85,6 +90,10 @@ class LocalExecutable { const absl::Span argument_shapes, ExecutableRunOptions run_options); + StatusOr GetExecutableRunOptions( + absl::Span argument_shapes, + const ExecutableRunOptions& run_options); + // The ordinal of the device which this executable was compiled for. The // executable can run on all equivalent devices (as determined by // Backend::devices_equivalent). diff --git a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc index 2b19aaded9c..2231fc6feab 100644 --- a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc +++ b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc @@ -45,7 +45,8 @@ void CompileAndExecute( xla::ClientLibrary::GetXlaService(client->platform()) ->backend() .memory_allocator()); - StatusOr result = executable->Run({}, execute_options); + StatusOr result = + executable->Run(absl::Span(), execute_options); { absl::MutexLock lock(results_mutex); results->emplace_back(device_ordinal, std::move(result)); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index d575bbb1f3e..8e8c3605cc7 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1324,14 +1324,16 @@ void BM_WhileLoop(int num_iters) { options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = executable->Run({}, options); + auto result = + executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = executable->Run({}, options); + auto result = + executable->Run(absl::Span(), options); ASSERT_TRUE(result.ok()); } } From bc1c0e86a677d9b1e5d3e3f0da85c445c2a7efe2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 14:24:06 -0700 Subject: [PATCH 060/112] Wrap save/restore logic in tf.function when in eager mode. This allows parallel saving and restoring when using multiple devices. PiperOrigin-RevId: 317180143 Change-Id: Icdc740d02beb7c2d3236191add3b72fa103fc134 --- .../grappler/optimizers/function_optimizer.cc | 8 +- .../parallel_device/parallel_device_test.py | 4 - .../python/framework/auto_control_deps.py | 2 +- tensorflow/python/training/saving/BUILD | 1 - .../training/saving/functional_saver.py | 111 ++++++------------ .../training/saving/functional_saver_test.py | 17 +-- 6 files changed, 42 insertions(+), 101 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index 0e156aaa84c..a66e645e04b 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -837,6 +837,7 @@ const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { "ParameterizedTruncatedNormal", "TruncatedNormal", "RandomShuffle", "Multinomial", "RandomGamma", "RandomGammaGrad", "RandomPoisson", "RandomPoissonV2", + // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) // ReadVariableOp marked as stateful because it consumes DT_RESOURCE, // but it can't generate any observable side-effect. 
@@ -850,12 +851,7 @@ const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { // the same device_ordinal on the same host. "EnqueueTPUEmbeddingSparseBatch", "EnqueueTPUEmbeddingIntegerBatch", "EnqueueTPUEmbeddingSparseTensorBatch", - "EnqueueTPUEmbeddingRaggedTensorBatch", - - // SaveV2 and RestoreV2 should be allowed to operate in parallel on - // multiple hosts. - "SaveV2", "RestoreV2"}); - // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) + "EnqueueTPUEmbeddingRaggedTensorBatch"}); return exemption->contains(op); } diff --git a/tensorflow/python/distribute/parallel_device/parallel_device_test.py b/tensorflow/python/distribute/parallel_device/parallel_device_test.py index 1429c522aba..8fc3dcb5816 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device_test.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device_test.py @@ -172,8 +172,6 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): config.set_synchronous_execution(previous) def test_checkpointing(self): - self.skipTest( - "Disable saving until SaveableObject's methods are traceable.") prefix = os.path.join(self.get_temp_dir(), "ckpt") with self.device.scope(): different_values = self.device.pack( @@ -265,8 +263,6 @@ class LayerTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[1], final_kernels[1].backing_device) def test_training_loop(self): - self.skipTest( - "Disable saving until SaveableObject's methods are traceable.") for _ in range(5): layer = _Dense(5) checkpoint = tracking.Checkpoint(layer=layer) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 4b47735e0bf..51dcb248b11 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -100,7 +100,7 @@ _ORDER_INSENSITIVE_STATEFUL_OPS = [ "CudnnRNNV2", "CudnnRNNV3", "CudnnRNNBackpropV2", "CudnnRNNBackpropV3", "EnqueueTPUEmbeddingSparseBatch", "EnqueueTPUEmbeddingIntegerBatch", "EnqueueTPUEmbeddingSparseTensorBatch", - "EnqueueTPUEmbeddingRaggedTensorBatch", "RestoreV2", "SaveV2" + "EnqueueTPUEmbeddingRaggedTensorBatch" ] # LINT.ThenChange(//tensorflow/core/grappler/optimizers/function_optimizer.cc) diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD index 12940840309..670a4c35c6f 100644 --- a/tensorflow/python/training/saving/BUILD +++ b/tensorflow/python/training/saving/BUILD @@ -43,7 +43,6 @@ cuda_py_test( ":checkpoint_options", ":functional_saver", ":saveable_hook", - "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", ], ) diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py index 3a9b565470d..c4334e096df 100644 --- a/tensorflow/python/training/saving/functional_saver.py +++ b/tensorflow/python/training/saving/functional_saver.py @@ -21,7 +21,6 @@ from __future__ import print_function import uuid from tensorflow.core.protobuf import saver_pb2 -from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -162,8 +161,7 @@ class MultiDeviceSaver(object): self._after_restore_callbacks.append(saveable.after_restore) if is_saveable: - host_device = saveable_object_util.set_cpu0(saveable.device) - saveables_by_device.setdefault(host_device, []).append(saveable) + saveables_by_device.setdefault(saveable.device, 
[]).append(saveable) self._single_device_savers = { device: _SingleDeviceSaver(saveables) @@ -249,50 +247,33 @@ class MultiDeviceSaver(object): tmp_checkpoint_prefix = string_ops.string_join( [file_prefix, sharded_suffix]) - def save_fn(): - num_shards = len(self._single_device_savers) - sharded_saves = [] - sharded_prefixes = [] - num_shards_tensor = constant_op.constant(num_shards, name="num_shards") - last_device = None - for shard, (device, saver) in enumerate( - sorted(self._single_device_savers.items())): - last_device = device - with ops.device(saveable_object_util.set_cpu0(device)): - shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, - num_shards_tensor) - sharded_prefixes.append(shard_prefix) - with ops.device(device): - # _SingleDeviceSaver will use the CPU device when necessary, but - # initial read operations should be placed on the SaveableObject's - # device. - sharded_saves.append(saver.save(shard_prefix, options)) + num_shards = len(self._single_device_savers) + sharded_saves = [] + sharded_prefixes = [] + num_shards_tensor = constant_op.constant(num_shards, name="num_shards") + last_device = None + for shard, (device, saver) in enumerate( + sorted(self._single_device_savers.items())): + last_device = device + with ops.device(saveable_object_util.set_cpu0(device)): + shard_prefix = sharded_filename(tmp_checkpoint_prefix, shard, + num_shards_tensor) + sharded_prefixes.append(shard_prefix) + with ops.device(device): + # _SingleDeviceSaver will use the CPU device when necessary, but initial + # read operations should be placed on the SaveableObject's device. + sharded_saves.append(saver.save(shard_prefix, options)) - with ops.control_dependencies(sharded_saves): - # Merge on the io_device if specified, otherwise co-locates the merge op - # with the last device used. - merge_device = ( - options.experimental_io_device or - saveable_object_util.set_cpu0(last_device)) - with ops.device(merge_device): - # V2 format write path consists of a metadata merge step. Once - # merged, attempts to delete the temporary directory, - # "_temp". - return gen_io_ops.merge_v2_checkpoints( - sharded_prefixes, file_prefix, delete_old_dirs=True) - - # Since this will causes a function re-trace on each save, limit this to the - # cases where it is needed: eager and when there are multiple tasks/single - # device savers. Note that the retrace is needed to ensure we pickup the - # latest values of options like experimental_io_device. - if context.executing_eagerly() and len(self._single_device_savers) > 1: - # Explicitly place the identity op on the first device. - @def_function.function(experimental_compile=False) - def tf_function_save(): - save_fn() - tf_function_save() - else: - return save_fn() + with ops.control_dependencies(sharded_saves): + # Merge on the io_device if specified, otherwise co-locates the merge op + # with the last device used. + merge_device = (options.experimental_io_device or + saveable_object_util.set_cpu0(last_device)) + with ops.device(merge_device): + # V2 format write path consists of a metadata merge step. Once merged, + # attempts to delete the temporary directory, "_temp". + return gen_io_ops.merge_v2_checkpoints( + sharded_prefixes, file_prefix, delete_old_dirs=True) def restore(self, file_prefix, options=None): """Restore the saveable objects from a checkpoint with `file_prefix`. @@ -306,38 +287,12 @@ class MultiDeviceSaver(object): A dictionary mapping from SaveableObject names to restore operations. 
""" options = options or checkpoint_options.CheckpointOptions() - - def restore_fn(): - restore_ops = {} - # Sort by device name to avoid propagating non-deterministic dictionary - # ordering in some Python versions. - for device, saver in sorted(self._single_device_savers.items()): - with ops.device(device): - restore_ops.update(saver.restore(file_prefix, options)) - - return restore_ops - - # Since this will causes a function re-trace on each save, limit this to the - # cases where it is needed: eager and when there are multiple tasks/single - # device savers. Note that the retrace is needed to ensure we pickup the - # latest values of options like experimental_io_device. - if context.executing_eagerly() and len(self._single_device_savers) > 1: - first_device, _ = list(self._single_device_savers.items())[0] - @def_function.function(experimental_compile=False) - def tf_function_restore(): - restore_ops = restore_fn() - restore_tensors = {} - # tf.functions must return tensors, thus we use control dependencies so - # that we can return a tensor which depends on the given op. - with ops.device(saveable_object_util.set_cpu0(first_device)): - for name, op in restore_ops.items(): - with ops.control_dependencies([op]): - restore_tensors[name] = array_ops.identity(file_prefix) - return restore_tensors - - restore_ops = tf_function_restore() - else: - restore_ops = restore_fn() + restore_ops = {} + # Sort by device name to avoid propagating non-deterministic dictionary + # ordering in some Python versions. + for device, saver in sorted(self._single_device_savers.items()): + with ops.device(device): + restore_ops.update(saver.restore(file_prefix, options)) for callback in self._after_restore_callbacks: callback() diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py index 8f3eef4fb9c..7db32ff72d7 100644 --- a/tensorflow/python/training/saving/functional_saver_test.py +++ b/tensorflow/python/training/saving/functional_saver_test.py @@ -21,7 +21,6 @@ from __future__ import print_function import os from tensorflow.python.eager import context -from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.eager import wrap_function from tensorflow.python.framework import config @@ -30,7 +29,6 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import gfile -from tensorflow.python.training import server_lib from tensorflow.python.training.saving import checkpoint_options from tensorflow.python.training.saving import functional_saver from tensorflow.python.training.saving import saveable_hook @@ -128,16 +126,13 @@ class SaverTest(test.TestCase): second_saver.restore(save_path) self.assertEqual(2., self.evaluate(v2)) - def test_checkpoint_is_sharded_by_task(self): - servers = [server_lib.Server.create_local_server() for _ in range(3)] - cluster_spec = server_lib.ClusterSpec({ - "worker": [s.target[len("grpc://"):] for s in servers]}) - remote.connect_to_cluster(cluster_spec) - with ops.device("/job:worker/task:0/cpu:0"): + @test_util.run_in_graph_and_eager_modes + def test_checkpoint_is_sharded_by_device(self): + with ops.device("cpu:0"): v0 = resource_variable_ops.ResourceVariable(0.) - with ops.device("/job:worker/task:1/cpu:0"): + with ops.device("cpu:1"): v1 = resource_variable_ops.ResourceVariable(1.) 
- with ops.device("/job:worker/task:2/cpu:0"): + with ops.device("cpu:2"): v2 = resource_variable_ops.ResourceVariable(2.) self.evaluate([v0.initializer, v1.initializer, v2.initializer]) @@ -172,7 +167,7 @@ class SaverTest(test.TestCase): list(saveable_object_util.saveable_objects_for_op(v2, "v2"))) prefix = os.path.join(self.get_temp_dir(), "ckpt") self.evaluate(saver.save(constant_op.constant(prefix), self.local_options)) - self.assertEqual(2, len(gfile.Glob(prefix + "*"))) + self.assertEqual(4, len(gfile.Glob(prefix + "*"))) self.evaluate(v0.assign(-1.)) self.evaluate(v1.assign(-1.)) self.evaluate(v2.assign(-1.)) From cb6e1ed5d8a406861398c428ca5fd6b84b439357 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Thu, 18 Jun 2020 14:28:42 -0700 Subject: [PATCH 061/112] Return `debug_string` when creating CompilationCacheKey. PiperOrigin-RevId: 317181056 Change-Id: I02198244c1c3749ff1ecf4e0647b8daa80dd868c --- tensorflow/core/tpu/kernels/BUILD | 16 ++ .../kernels/tpu_compilation_cache_external.cc | 127 --------------- .../kernels/tpu_compilation_cache_external.h | 8 - .../core/tpu/kernels/tpu_compile_c_api.h | 19 ++- tensorflow/core/tpu/kernels/tpu_op_util.cc | 151 ++++++++++++++++++ tensorflow/core/tpu/kernels/tpu_op_util.h | 40 +++++ 6 files changed, 223 insertions(+), 138 deletions(-) create mode 100644 tensorflow/core/tpu/kernels/tpu_op_util.cc create mode 100644 tensorflow/core/tpu/kernels/tpu_op_util.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 94d3c8edf2b..9d38eb71f3c 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -405,6 +405,22 @@ cc_library( alwayslink = True, ) +cc_library( + name = "tpu_op_util", + srcs = ["tpu_op_util.cc"], + hdrs = ["tpu_op_util.h"], + deps = [ + ":tpu_compilation_cache_key", + ":tpu_compile_c_api_hdrs", + ":tpu_mesh_state_interface", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "tpu_util", srcs = ["tpu_util.cc"], diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc index 8cee90e8e55..c4442fc95d5 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.cc @@ -49,70 +49,6 @@ void PopulateEntry(const std::string& key, CompiledSubgraph* entry, absl::make_unique(std::move(tpu_program_group)); entry->initialized = true; } - -// Return fingerprint_in_metadata if it's not empty; otherwise read input tensor -// data to compute the fingerprint. 
-std::string GuaranteedConstFingerprint( - const string& fingerprint_in_metadata, - const OpInputList& guaranteed_constants) { - if (fingerprint_in_metadata.empty()) { - uint64_t fingerprint = 0; - for (const auto& constant : guaranteed_constants) { - fingerprint = TpuCompile_CreateGuaranteedConstFingerprint( - fingerprint, constant.tensor_data().data(), - constant.tensor_data().size()); - } - return std::to_string(fingerprint); - } else { - return fingerprint_in_metadata; - } -} - -std::string CreateShapePrefix( - const std::vector& dynamic_shapes) { - std::string shapes_prefix; - for (const TensorShape& shape : dynamic_shapes) { - for (int64 size : shape.dim_sizes()) { - absl::StrAppend(&shapes_prefix, size, ","); - } - absl::StrAppend(&shapes_prefix, ";"); - } - return shapes_prefix; -} - -// Include compilation configurations of the arguments that are not captured -// by the called graph. -std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) { - std::string config_prefix; - for (const auto& arg : metadata.args()) { - if (arg.is_same_data_across_replicas()) { - absl::StrAppend(&config_prefix, ":s"); - // Same. - } else { - // Different. - absl::StrAppend(&config_prefix, ":"); - } - if (arg.enable_xla_sharding() == - tpu::TPUCompileMetadataProto::Arg::ALLOWED) { - // Enabled. - absl::StrAppend(&config_prefix, "e"); - } - if (arg.unrestricted_layout()) { - // Unrestricted. - absl::StrAppend(&config_prefix, ":u"); - } - absl::StrAppend(&config_prefix, ",type(", arg.dtype(), ")"); - if (arg.has_shape()) { - absl::StrAppend(&config_prefix, ",shape("); - for (const auto& dim : arg.shape().dim()) { - absl::StrAppend(&config_prefix, dim.size(), ","); - } - absl::StrAppend(&config_prefix, ")"); - } - } - return config_prefix; -} - } // namespace TpuCompilationCacheExternal::EntryRefImpl::EntryRefImpl( @@ -196,68 +132,5 @@ CompiledSubgraph* TpuCompilationCacheExternal::InitializeEntry( marked_for_eviction_size_ += main_entry->total_size; return main_entry; } - -/*static*/ TpuCompilationCacheKey -TpuCompilationCacheExternal::CreateCompilationCacheKey( - absl::string_view function_name, uint64 function_library_fingerprint, - absl::string_view mlir_module, - const tensorflow::OpInputList& guaranteed_constants, - const std::vector& dynamic_shapes, - const tensorflow::tpu::TPUCompileMetadataProto& metadata, - const TpuMeshStateInterface& mesh_state) { - VLOG(1) << "FunctionLibraryFingerprint:" << function_library_fingerprint; - std::string shapes_prefix = CreateShapePrefix(dynamic_shapes); - VLOG(1) << "shapes_prefix = " << shapes_prefix; - std::string config_prefix = CreateConfigPrefix(metadata); - VLOG(1) << "config_prefix = " << config_prefix; - std::vector flattened_device_ids; - if (metadata.has_device_assignment()) { - for (const auto& device : - metadata.device_assignment().computation_devices()) { - flattened_device_ids.insert(flattened_device_ids.end(), - device.replica_device_ids().begin(), - device.replica_device_ids().end()); - } - } - // TODO(henrytan): return the debug_string. 
- const char* prefix = - TpuCompile_CreateCompilationCacheKey(CompilationCacheKeyProperty{ - config_prefix.data(), - shapes_prefix.data(), - function_name.data(), - mlir_module.data(), - flattened_device_ids.data(), - flattened_device_ids.size(), - guaranteed_constants.size(), - function_library_fingerprint, - metadata.num_cores_per_replica(), - metadata.num_replicas(), - mesh_state.data(), - }); - auto buffer_cleanup = gtl::MakeCleanup([prefix]() { delete[] prefix; }); - TpuCompilationCacheKey key; - key.prefix = prefix; - - // Guaranteed constants can be different across sessions. Use session_handle - // and guaranteed_const fingerprint to guarantee no collision. - if (guaranteed_constants.size() > 0) { - key.has_guaranteed_const = true; - key.session_handle = metadata.session_handle(); - // Both `metadata` and `guaranteed_constants` lifetime are captured by - // reference based on the assumption that these variables lifetime is - // managed through the `TPUCompileOpKernelImpl` that outlives the - // lifetime of the compilation cache lookups. - string fingerprint; - key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants, - fingerprint]() mutable { - if (fingerprint.empty()) { - fingerprint = GuaranteedConstFingerprint( - metadata.guaranteed_const_fingerprint(), guaranteed_constants); - } - return fingerprint; - }; - } - return key; -} } // namespace tpu } // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h index 2c75cb4d053..fe251326a43 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h @@ -63,14 +63,6 @@ class TpuCompilationCacheExternal : public TpuCompilationCacheInterface { explicit TpuCompilationCacheExternal(int64 max_cache_size) : TpuCompilationCacheInterface(max_cache_size) {} - static TpuCompilationCacheKey CreateCompilationCacheKey( - absl::string_view function_name, uint64 function_library_fingerprint, - absl::string_view mlir_module, - const tensorflow::OpInputList& guaranteed_constants, - const std::vector& dynamic_shapes, - const tensorflow::tpu::TPUCompileMetadataProto& metadata, - const TpuMeshStateInterface& mesh_state); - string DebugString() const override { return "TpuCompilationCacheExternal"; } private: diff --git a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h index d1546ed9610..c101e489d56 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_c_api.h +++ b/tensorflow/core/tpu/kernels/tpu_compile_c_api.h @@ -42,6 +42,13 @@ struct CompilationCacheKeyProperty { const XLA_TpuMeshState* mesh_state; }; +// Compilation cache key result returning both the key and a more verbose debug +// version. +struct CompilationCacheKeyResult { + const char* key; + const char* debug_string; +}; + extern "C" { // Returns the number of available TPU core count. @@ -49,9 +56,14 @@ TFTPU_CAPI_EXPORT int TpuTopology_AvailableCoreCount( const XLA_TpuMeshState* mesh_state, TpuCoreTypeEnum tpu_core_type); // Creates a unique compilation cache `key` used for `put` and `get` operations. -// Returned buffer is heap-allocated and must be owned. -TFTPU_CAPI_EXPORT const char* TpuCompile_CreateCompilationCacheKey( - CompilationCacheKeyProperty property); +// Returned buffers are heap-allocated and must be owned. 
+TFTPU_CAPI_EXPORT CompilationCacheKeyResult +TpuCompile_CreateCompilationCacheKey(CompilationCacheKeyProperty property); + +// Destroys the CompilationCacheKeyResult returned by calling the +// `TpuCompile_CreateCompilationCacheKey` API. +TFTPU_CAPI_EXPORT void TpuCompile_DestroyCompilationCacheKey( + CompilationCacheKeyResult result); // Creates a guaranteed const fingerprint. Guarantee const is normally used in // TPU inference to avoid re-copying unchanged variables onto the TPU device. @@ -75,6 +87,7 @@ TFTPU_CAPI_EXPORT void TpuCompile_BuildXLADeviceAssignment( struct TfTpu_CompileApiFn { TFTPU_ADD_FN_IN_STRUCT(TpuTopology_AvailableCoreCount); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CreateCompilationCacheKey); + TFTPU_ADD_FN_IN_STRUCT(TpuCompile_DestroyCompilationCacheKey); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CreateGuaranteedConstFingerprint); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_CompileAheadOfTime); TFTPU_ADD_FN_IN_STRUCT(TpuCompile_BuildXLADeviceAssignment); diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.cc b/tensorflow/core/tpu/kernels/tpu_op_util.cc new file mode 100644 index 00000000000..e2f717fea8b --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_util.cc @@ -0,0 +1,151 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/tpu/kernels/tpu_op_util.h" + +#include + +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" + +namespace tensorflow { +namespace tpu { +namespace { +// Return fingerprint_in_metadata if it's not empty; otherwise read input tensor +// data to compute the fingerprint. +std::string GuaranteedConstFingerprint( + const string& fingerprint_in_metadata, + const OpInputList& guaranteed_constants) { + if (fingerprint_in_metadata.empty()) { + uint64_t fingerprint = 0; + for (const auto& constant : guaranteed_constants) { + fingerprint = TpuCompile_CreateGuaranteedConstFingerprint( + fingerprint, constant.tensor_data().data(), + constant.tensor_data().size()); + } + return std::to_string(fingerprint); + } else { + return fingerprint_in_metadata; + } +} + +std::string CreateShapePrefix( + const std::vector& dynamic_shapes) { + std::string shapes_prefix; + for (const TensorShape& shape : dynamic_shapes) { + for (int64 size : shape.dim_sizes()) { + absl::StrAppend(&shapes_prefix, size, ","); + } + absl::StrAppend(&shapes_prefix, ";"); + } + return shapes_prefix; +} + +// Include compilation configurations of the arguments that are not captured +// by the called graph. +std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) { + std::string config_prefix; + for (const auto& arg : metadata.args()) { + if (arg.is_same_data_across_replicas()) { + absl::StrAppend(&config_prefix, ":s"); + // Same. + } else { + // Different. + absl::StrAppend(&config_prefix, ":"); + } + if (arg.enable_xla_sharding() == + tpu::TPUCompileMetadataProto::Arg::ALLOWED) { + // Enabled. 
+ absl::StrAppend(&config_prefix, "e"); + } + if (arg.unrestricted_layout()) { + // Unrestricted. + absl::StrAppend(&config_prefix, ":u"); + } + absl::StrAppend(&config_prefix, ",type(", arg.dtype(), ")"); + if (arg.has_shape()) { + absl::StrAppend(&config_prefix, ",shape("); + for (const auto& dim : arg.shape().dim()) { + absl::StrAppend(&config_prefix, dim.size(), ","); + } + absl::StrAppend(&config_prefix, ")"); + } + } + return config_prefix; +} +} // namespace + +TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + absl::string_view mlir_module, const OpInputList& guaranteed_constants, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state) { + VLOG(1) << "FunctionLibraryFingerprint:" << function_library_fingerprint; + std::string shapes_prefix = CreateShapePrefix(dynamic_shapes); + VLOG(1) << "shapes_prefix = " << shapes_prefix; + std::string config_prefix = CreateConfigPrefix(metadata); + VLOG(1) << "config_prefix = " << config_prefix; + std::vector flattened_device_ids; + if (metadata.has_device_assignment()) { + for (const auto& device : + metadata.device_assignment().computation_devices()) { + flattened_device_ids.insert(flattened_device_ids.end(), + device.replica_device_ids().begin(), + device.replica_device_ids().end()); + } + } + CompilationCacheKeyResult result = + TpuCompile_CreateCompilationCacheKey(CompilationCacheKeyProperty{ + config_prefix.data(), + shapes_prefix.data(), + function_name.data(), + mlir_module.data(), + flattened_device_ids.data(), + flattened_device_ids.size(), + guaranteed_constants.size(), + function_library_fingerprint, + metadata.num_cores_per_replica(), + metadata.num_replicas(), + mesh_state.data(), + }); + auto buffer_cleanup = gtl::MakeCleanup( + [result]() { TpuCompile_DestroyCompilationCacheKey(result); }); + TpuCompilationCacheKey key; + key.prefix = result.key; + key.debug_string = result.debug_string; + + // Guaranteed constants can be different across sessions. Use session_handle + // and guaranteed_const fingerprint to guarantee no collision. + if (guaranteed_constants.size() > 0) { + key.has_guaranteed_const = true; + key.session_handle = metadata.session_handle(); + // Both `metadata` and `guaranteed_constants` lifetime are captured by + // reference based on the assumption that these variables lifetime is + // managed through the `TPUCompileOpKernelImpl` that outlives the + // lifetime of the compilation cache lookups. + string fingerprint; + key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants, + fingerprint]() mutable { + if (fingerprint.empty()) { + fingerprint = GuaranteedConstFingerprint( + metadata.guaranteed_const_fingerprint(), guaranteed_constants); + } + return fingerprint; + }; + } + return key; +} +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.h b/tensorflow/core/tpu/kernels/tpu_op_util.h new file mode 100644 index 00000000000..0a9657ca05e --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_util.h @@ -0,0 +1,40 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_key.h" +#include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" + +namespace tensorflow { +namespace tpu { +// Creates a unique compilation cache `key`. +TpuCompilationCacheKey CreateCompilationCacheKey( + absl::string_view function_name, uint64 function_library_fingerprint, + absl::string_view mlir_module, const OpInputList& guaranteed_constants, + const std::vector& dynamic_shapes, + const TPUCompileMetadataProto& metadata, + const TpuMeshStateInterface& mesh_state); +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_UTIL_H_ From 9d33f296d1edc2f656e253cf2a015d36daedd5c1 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Thu, 18 Jun 2020 15:00:50 -0700 Subject: [PATCH 062/112] Prep change for publishing TPU Ops. PiperOrigin-RevId: 317188030 Change-Id: I29f9236c0ade6bf586c8a52ead977b5d31aec357 --- tensorflow/core/tpu/kernels/BUILD | 11 +++ .../tpu_compilation_cache_entry_unloader.h | 69 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 9d38eb71f3c..a41747ee8c5 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -450,3 +450,14 @@ cc_library( hdrs = ["tpu_compile_op.h"], deps = ["//tensorflow/core:framework"], ) + +cc_library( + name = "tpu_compilation_cache_entry_unloader", + hdrs = ["tpu_compilation_cache_entry_unloader.h"], + deps = [ + ":tpu_compilation_cache_interface", + "//tensorflow/core:framework", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/synchronization", + ], +) diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h new file mode 100644 index 00000000000..c298d8fcc12 --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_entry_unloader.h @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_COMPILATION_CACHE_ENTRY_UNLOADER_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_COMPILATION_CACHE_ENTRY_UNLOADER_H_ + +#include "absl/container/flat_hash_set.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h" + +namespace tensorflow { +namespace tpu { + +class TpuCompilationCacheEntryUnloader : public ResourceBase { + public: + explicit TpuCompilationCacheEntryUnloader(TpuCompilationCacheInterface* cache) + : cache_(cache) { + // Hold a reference to the cache until the unloader is destroyed. + cache_->Ref(); + VLOG(1) << "Will unload compilation cache entries when session closes."; + } + + ~TpuCompilationCacheEntryUnloader() override { + absl::MutexLock lock(&mu_); + for (int64 uid : cache_entry_uids_) { + Status s = cache_->MarkEntryForEviction(uid); + if (!s.ok()) { + LOG(WARNING) << "MarkEntryForEviction in " + "~CompilationCacheEntryUnloader fails with error " + << s; + } + } + // Release our reference to the cache. + cache_->Unref(); + } + + // Add cache entry uid to be unloaded in destructor. + void AddCacheEntryUid(int64 uid) { + absl::MutexLock lock(&mu_); + cache_entry_uids_.insert(uid); + } + + std::string DebugString() const override { + return "CompilationCacheEntryUnloader"; + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(TpuCompilationCacheEntryUnloader); + mutable absl::Mutex mu_; + TpuCompilationCacheInterface* cache_; // Not owned. + absl::flat_hash_set cache_entry_uids_ ABSL_GUARDED_BY(mu_); +}; + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_KERNELS_COMPILATION_CACHE_ENTRY_UNLOADER_H_ From 35b978db57eaa87f32e7c9c3e9a7c323e595c978 Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Thu, 18 Jun 2020 15:00:53 -0700 Subject: [PATCH 063/112] Move tfl-device-index-selector to TF directory. There's nothing lite-specific about this pass. 
PiperOrigin-RevId: 317188038 Change-Id: Iac9799e296e043aabf7aeabec2e8f72d07c77178 --- tensorflow/compiler/mlir/lite/BUILD | 1 - tensorflow/compiler/mlir/lite/tf_tfl_passes.cc | 2 +- tensorflow/compiler/mlir/lite/transforms/passes.h | 3 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../transforms/device_index_selector.cc | 12 ++++++------ .../compiler/mlir/tensorflow/transforms/passes.h | 3 +++ .../tests/tf_device_index_selector.mlir | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) rename tensorflow/compiler/mlir/{lite => tensorflow}/transforms/device_index_selector.cc (92%) rename tensorflow/compiler/{mlir/lite => tensorflow}/tests/tf_device_index_selector.mlir (94%) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 8e9d615053c..8d4efeb3d60 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -314,7 +314,6 @@ tf_cc_test( cc_library( name = "tensorflow_lite_legalize_tf", srcs = [ - "transforms/device_index_selector.cc", "transforms/dilated_conv.cc", "transforms/generated_legalize_tf.inc", "transforms/generated_lower_static_tensor_list.inc", diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 008098f62ba..fed2896035b 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -63,7 +63,7 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, standard_pipeline_options.enable_inliner = false; standard_pipeline_options.form_clusters = pass_config.form_clusters; mlir::TF::CreateTFStandardPipeline(*pass_manager, standard_pipeline_options); - pass_manager->addPass(mlir::TFL::CreateDeviceIndexSelectorPass()); + pass_manager->addPass(mlir::TF::CreateDeviceIndexSelectorPass()); if (pass_config.shape_inference) { pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 01e5eb1cb68..105c9394fb4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -91,9 +91,6 @@ std::unique_ptr> CreateWhileOutlinePass(); // Verifies runtime constraints. std::unique_ptr> CreateRuntimeVerifyPass(); -// Creates function pass to select device index/fold tf.DeviceIndex. 
-std::unique_ptr> CreateDeviceIndexSelectorPass(); - } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 54e57512c32..7c0d427e87b 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -475,6 +475,7 @@ cc_library( "transforms/cluster_outlining.cc", "transforms/collection_ops_util.cc", "transforms/decompose_resource_ops_pass.cc", + "transforms/device_index_selector.cc", "transforms/einsum.cc", "transforms/executor_island_coarsening.cc", "transforms/executor_tpuv1_inline_tpu_island.cc", diff --git a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc similarity index 92% rename from tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc rename to tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc index d4aed750dc8..550647a915a 100644 --- a/tensorflow/compiler/mlir/lite/transforms/device_index_selector.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/device_index_selector.cc @@ -21,11 +21,11 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" namespace mlir { -namespace TFL { +namespace TF { namespace { // Folds the DeviceIndex op to a constant value. The DeviceIndex return the @@ -55,8 +55,8 @@ void DeviceIndexSelector::runOnOperation() { // Convert all the DeviceIndex ops to constant values. func.getBody().walk([](TF::DeviceIndexOp op) { // This just selects the default in all cases where DeviceIndex feeds into - // tf.Case. This could be enhanced based on explicit TFLite specification or - // TAC in future. + // tf.Case. This could be enhanced to have some sort of policy in the + // future. OpBuilder b(op); RankedTensorType type = RankedTensorType::get({}, b.getIntegerType(32)); int index = op.device_names().size(); @@ -79,7 +79,7 @@ std::unique_ptr> CreateDeviceIndexSelectorPass() { } static PassRegistration pass( - "tfl-device-index-selector", "Fold tf.DeviceIndex to constant"); + "tf-device-index-selector", "Fold tf.DeviceIndex to constant"); -} // namespace TFL +} // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index a34be28c809..168b317641d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -147,6 +147,9 @@ std::unique_ptr> CreateLegalizeHloToTfPass(); // generally used beyond exporting to runtimes that supports these ops. In the // future these fusions may be codegen'd automatically. std::unique_ptr> CreateFusedKernelMatcherPass(); + +// Creates function pass to select device index/fold tf.DeviceIndex. 
+std::unique_ptr> CreateDeviceIndexSelectorPass(); } // namespace TF namespace tf_executor { diff --git a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir similarity index 94% rename from tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir rename to tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir index 1ac7f30d644..7fc2b210f91 100644 --- a/tensorflow/compiler/mlir/lite/tests/tf_device_index_selector.mlir +++ b/tensorflow/compiler/tensorflow/tests/tf_device_index_selector.mlir @@ -1,6 +1,6 @@ // Test DeviceIndex selector. -// RUN: tf-opt --tfl-device-index-selector %s | FileCheck %s +// RUN: tf-opt --tf-device-index-selector %s | FileCheck %s // CHECK-LABEL: func @select func @select(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { From 834fe68f365e1d7f082b596fe87471ce84c2c8ec Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Thu, 18 Jun 2020 15:16:08 -0700 Subject: [PATCH 064/112] Optimized Arduino library by enabling precompilation Precompilation allows Arduino users to build their sketches much faster, but requires some support from the library properties to enable. This has recently been upgraded to suppor the 'full' mode, as shown in https://github.com/arduino/arduino-cli/pull/611, so we want to take advantage of this. PiperOrigin-RevId: 317191283 Change-Id: Ie44a31ba45105f65fdad0da487290aff5fa2a179 --- tensorflow/lite/micro/tools/make/templates/library.properties | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/templates/library.properties b/tensorflow/lite/micro/tools/make/templates/library.properties index e41fd8d8fbe..6e02748a0b4 100644 --- a/tensorflow/lite/micro/tools/make/templates/library.properties +++ b/tensorflow/lite/micro/tools/make/templates/library.properties @@ -7,4 +7,5 @@ paragraph=This library runs TensorFlow machine learning models on microcontrolle category=Data Processing url=https://www.tensorflow.org/lite/microcontrollers/overview ldflags=-lm -includes=TensorFlowLite.h \ No newline at end of file +includes=TensorFlowLite.h +precompiled=full From a82b75c82b63c4397b3d6a215e439ca77e687a84 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 18 Jun 2020 15:35:42 -0700 Subject: [PATCH 065/112] [XLA/Client] Implement LocalClient::Run which supports buffer donation PiperOrigin-RevId: 317195199 Change-Id: If4d35d0627fa068a0c2b522fdae52466abd21f51 --- .../compiler/xla/client/local_client.cc | 47 ++++--------------- tensorflow/compiler/xla/client/local_client.h | 9 ---- .../tests/multiple_devices_on_host_test.cc | 3 +- tensorflow/compiler/xla/tests/while_test.cc | 6 +-- 4 files changed, 12 insertions(+), 53 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index f71e8a2d56d..afe115deda8 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -168,26 +168,6 @@ LocalExecutable::RunHelper(const absl::Span argument_shapes, return std::make_pair(service_options, std::move(stream)); } -StatusOr LocalExecutable::GetExecutableRunOptions( - absl::Span argument_shapes, - const ExecutableRunOptions& run_options) { - TF_ASSIGN_OR_RETURN(auto options_and_stream, - RunHelper(argument_shapes, run_options)); - ExecutableRunOptions options = options_and_stream.first.run_options(); - options.set_device_ordinal(-1); - return options; -} - -template -static StatusOr BlockHostUntilDoneAfterAsyncCall( - se::Stream* stream, std::function()> async_callback) { - StatusOr result = async_callback(); - Status block_status = stream->BlockHostUntilDone(); - TF_RETURN_IF_ERROR(result.status()); - TF_RETURN_IF_ERROR(block_status); - return result; -} - StatusOr LocalExecutable::Run( const absl::Span arguments, ExecutableRunOptions run_options) { @@ -196,24 +176,15 @@ StatusOr LocalExecutable::Run( for (const ShapedBuffer* const arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, - GetExecutableRunOptions(argument_shapes, run_options)); - return BlockHostUntilDoneAfterAsyncCall( - options.stream(), [&] { return RunAsync(arguments, options); }); -} - -StatusOr LocalExecutable::Run( - std::vector arguments, ExecutableRunOptions run_options) { - std::vector argument_shapes; - argument_shapes.reserve(arguments.size()); - for (const ExecutionInput& arg : arguments) { - argument_shapes.push_back(&arg.shape()); - } - TF_ASSIGN_OR_RETURN(ExecutableRunOptions options, - GetExecutableRunOptions(argument_shapes, run_options)); - return BlockHostUntilDoneAfterAsyncCall( - options.stream(), - [&] { return RunAsync(argument_shapes, std::move(arguments), options); }); + TF_ASSIGN_OR_RETURN(auto options_and_stream, + RunHelper(argument_shapes, run_options)); + ExecutableRunOptions options = options_and_stream.first.run_options(); + options.set_device_ordinal(-1); + auto result = RunAsync(arguments, options); + Status block_status = options.stream()->BlockHostUntilDone(); + TF_RETURN_IF_ERROR(result.status()); + TF_RETURN_IF_ERROR(block_status); + return result; } static std::shared_ptr DumpArguments( diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index b00f5cc6801..7cdeb9dcbf6 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -51,11 +51,6 @@ class LocalExecutable { const absl::Span arguments, ExecutableRunOptions run_options); - // Similar to Run(), but allows for donating argument buffers to the - // executable. 
- StatusOr Run(std::vector arguments, - ExecutableRunOptions run_options); - // Similar to Run(), but need not block the host waiting for the computation // to complete before returning. StatusOr RunAsync( @@ -90,10 +85,6 @@ class LocalExecutable { const absl::Span argument_shapes, ExecutableRunOptions run_options); - StatusOr GetExecutableRunOptions( - absl::Span argument_shapes, - const ExecutableRunOptions& run_options); - // The ordinal of the device which this executable was compiled for. The // executable can run on all equivalent devices (as determined by // Backend::devices_equivalent). diff --git a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc index 2231fc6feab..2b19aaded9c 100644 --- a/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc +++ b/tensorflow/compiler/xla/tests/multiple_devices_on_host_test.cc @@ -45,8 +45,7 @@ void CompileAndExecute( xla::ClientLibrary::GetXlaService(client->platform()) ->backend() .memory_allocator()); - StatusOr result = - executable->Run(absl::Span(), execute_options); + StatusOr result = executable->Run({}, execute_options); { absl::MutexLock lock(results_mutex); results->emplace_back(device_ordinal, std::move(result)); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 8e8c3605cc7..d575bbb1f3e 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1324,16 +1324,14 @@ void BM_WhileLoop(int num_iters) { options.set_allocator(&allocator); const int kWarmups = 2; for (int i = 0; i < kWarmups; ++i) { - auto result = - executable->Run(absl::Span(), options); + auto result = executable->Run({}, options); ASSERT_TRUE(result.ok()); } // Run benchmark. tensorflow::testing::StartTiming(); for (int i = 0; i < num_iters; ++i) { - auto result = - executable->Run(absl::Span(), options); + auto result = executable->Run({}, options); ASSERT_TRUE(result.ok()); } } From 0deffad6acbc2f5848022bf8ae360c9adbdf1ef8 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 18 Jun 2020 15:50:38 -0700 Subject: [PATCH 066/112] Make `return_state` as explicit kwarg in the Conv2DLSTM layer. It was previously hide in the **kwargs, and we are also missing documentation for it. The existing test case should already cover the functionality of it. PiperOrigin-RevId: 317197835 Change-Id: Icfae1e177eeb886b41345078f6b93f282a94df5b --- .../keras/layers/convolutional_recurrent.py | 43 +++++++++++-------- ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt | 2 +- ...orflow.keras.layers.-conv-l-s-t-m2-d.pbtxt | 2 +- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py index 19831429b73..6c812204cba 100644 --- a/tensorflow/python/keras/layers/convolutional_recurrent.py +++ b/tensorflow/python/keras/layers/convolutional_recurrent.py @@ -753,7 +753,9 @@ class ConvLSTM2D(ConvRNN2D): the `recurrent_kernel` weights matrix. bias_constraint: Constraint function applied to the bias vector. return_sequences: Boolean. Whether to return the last output - in the output sequence, or the full sequence. + in the output sequence, or the full sequence. (default False) + return_state: Boolean Whether to return the last state + in addition to the output. (default False) go_backwards: Boolean (default False). If True, process the input sequence backwards. stateful: Boolean (default False). 
If True, the last state @@ -786,22 +788,27 @@ class ConvLSTM2D(ConvRNN2D): `(samples, time, rows, cols, channels)` Output shape: - - If `return_sequences` - - If data_format='channels_first' - 5D tensor with shape: - `(samples, time, filters, output_row, output_col)` - - If data_format='channels_last' - 5D tensor with shape: - `(samples, time, output_row, output_col, filters)` - - Else - - If data_format ='channels_first' - 4D tensor with shape: - `(samples, filters, output_row, output_col)` - - If data_format='channels_last' - 4D tensor with shape: - `(samples, output_row, output_col, filters)` - where `o_row` and `o_col` depend on the shape of the filter and - the padding + - If `return_state`: a list of tensors. The first tensor is + the output. The remaining tensors are the last states, + each 4D tensor with shape: + `(samples, filters, new_rows, new_cols)` + if data_format='channels_first' + or 4D tensor with shape: + `(samples, new_rows, new_cols, filters)` + if data_format='channels_last'. + `rows` and `cols` values might have changed due to padding. + - If `return_sequences`: 5D tensor with shape: + `(samples, timesteps, filters, new_rows, new_cols)` + if data_format='channels_first' + or 5D tensor with shape: + `(samples, timesteps, new_rows, new_cols, filters)` + if data_format='channels_last'. + - Else, 4D tensor with shape: + `(samples, filters, new_rows, new_cols)` + if data_format='channels_first' + or 4D tensor with shape: + `(samples, new_rows, new_cols, filters)` + if data_format='channels_last'. Raises: ValueError: in case of invalid constructor arguments. @@ -834,6 +841,7 @@ class ConvLSTM2D(ConvRNN2D): recurrent_constraint=None, bias_constraint=None, return_sequences=False, + return_state=False, go_backwards=False, stateful=False, dropout=0., @@ -863,6 +871,7 @@ class ConvLSTM2D(ConvRNN2D): dtype=kwargs.get('dtype')) super(ConvLSTM2D, self).__init__(cell, return_sequences=return_sequences, + return_state=return_state, go_backwards=go_backwards, stateful=stateful, **kwargs) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index f77d613e354..958d06a0d0f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -207,7 +207,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', 
\'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt index f77d613e354..958d06a0d0f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt @@ -207,7 +207,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\', \'0.0\', \'0.0\'], " } member_method { name: "add_loss" From e08382691bfb897d584c5d5a8e8a0abe0472373d Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Thu, 18 Jun 2020 15:54:03 -0700 Subject: [PATCH 067/112] Make "map_resources" overridable by subclass of `Trackable`. This allows moving the implementation of map_resources from `tf.saved_model.save` to subclass of `Trackable`, e.g, Variable, DistributedVariable. This is a non-functional change. 
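For illustration only (not part of this change), a subclass that owns a single resource tensor could satisfy the new hook roughly as follows; the class name is hypothetical, and the body simply mirrors the copy-and-remap pattern that `tf.saved_model.save` previously applied to `CapturableResource` objects and resource variables:

    # Hypothetical sketch of the `_map_resources` contract; the class and its
    # name are illustrative, not code introduced by this change.
    import copy

    from tensorflow.python.framework import ops
    from tensorflow.python.training.tracking import tracking


    class ExportableResource(tracking.CapturableResource):

      def _map_resources(self):
        # Copy the object and give the copy a handle op created inside the
        # export graph, on the same device as the original resource.
        new_obj = copy.copy(self)
        with ops.device(self._resource_device):
          new_obj._resource_handle = new_obj._create_resource()
        # Map the eager-side object and its resource tensor to the graph-side
        # replacements so the saver can rewrite captures.
        object_map = {self: new_obj}
        resource_map = {self.resource_handle: new_obj._resource_handle}
        return object_map, resource_map

Subclasses that wrap several component objects can map every component and its handle to a single replacement, as the DistributedVariable hunk in values.py below does.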
PiperOrigin-RevId: 317198449 Change-Id: I4aa48d4974b6547b5de8ac0f5c38f3da29d364bc --- tensorflow/python/distribute/BUILD | 4 +-- tensorflow/python/distribute/values.py | 12 +++++++ .../experimental/autocast_variable.py | 7 ++++ .../python/ops/resource_variable_ops.py | 7 ++++ tensorflow/python/saved_model/save.py | 36 +++++-------------- tensorflow/python/training/tracking/base.py | 18 ++++++++++ .../python/training/tracking/tracking.py | 13 +++++++ 7 files changed, 66 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 96559a9a740..7208807a18c 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -744,14 +744,12 @@ py_library( "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", - "//tensorflow/python:tensor_util", + "//tensorflow/python:resource_variable_ops", "//tensorflow/python:tf_export", "//tensorflow/python:type_spec", - "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", "//tensorflow/python/eager:context", - "//tensorflow/python/eager:tape", "//tensorflow/python/training/saving:saveable_object", "//tensorflow/python/training/saving:saveable_object_util", "//tensorflow/python/training/tracking:base", diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index d0ed27c69de..60b2ea4fe31 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import type_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training.saving import saveable_object @@ -793,6 +794,17 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return ops.convert_to_tensor( self._get(), dtype=dtype, name=name, as_ref=as_ref) + def _map_resources(self): + """For implementing `Trackable`.""" + new_obj = resource_variable_ops.copy_to_graph_uninitialized(self._primary) + obj_map, resource_map = {}, {} + for v in self._values: + obj_map[v] = new_obj + resource_map[v.handle] = new_obj.handle + obj_map[self] = new_obj + resource_map[self] = new_obj.handle + return obj_map, resource_map + class _DistributedVariableSaveable(saveable_object.SaveableObject): """Class for defining how to restore a DistributedVariable.""" diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index 7d0abe30581..57e8ced65a0 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -285,6 +285,13 @@ class AutoCastVariable(variables.Variable, core.Tensor): # models with normal variables, and vice versa. return self._variable._gather_saveables_for_checkpoint() # pylint:disable=protected-access + def _map_resources(self): + # By delegating this method to the wrapped variable, SavedModel with + # AutoCastVariables are identical to SavedModel with normal variables. 
+ obj_map, resource_map = self._variable._map_resources() # pylint:disable=protected-access + obj_map[self] = obj_map[self._variable] + return obj_map, resource_map + # TODO(reedwm): Maybe encode the fact the variable is an AutoCastVariable in # to_proto(). def to_proto(self, export_scope=None): diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 25f6347f034..cb235fcbe2d 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -633,6 +633,13 @@ class BaseResourceVariable(variables.VariableV1, core.Tensor): return gen_state_ops.resource_count_up_to(self.handle, limit=limit, T=self.dtype) + def _map_resources(self): + """For implementing `Trackable`.""" + new_variable = copy_to_graph_uninitialized(self) + obj_map = {self: new_variable} + resource_map = {self._handle: new_variable.handle} + return obj_map, resource_map + def _read_variable_op(self): variable_accessed(self) diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py index 5844c80995f..802ce1d61b7 100644 --- a/tensorflow/python/saved_model/save.py +++ b/tensorflow/python/saved_model/save.py @@ -19,14 +19,12 @@ from __future__ import division from __future__ import print_function import collections -import copy import os from tensorflow.core.framework import versions_pb2 from tensorflow.core.protobuf import meta_graph_pb2 from tensorflow.core.protobuf import saved_model_pb2 from tensorflow.core.protobuf import saved_object_graph_pb2 -from tensorflow.python.distribute import distribute_utils as ds_utils from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function as defun @@ -241,7 +239,7 @@ class _SaveableView(object): Creates resource handle ops in the current default graph, whereas `accessible_objects` will be from an eager context. Resource mapping adds resource handle ops to the main GraphDef of a SavedModel, which allows the - C++ loader API to interact with variables. + C++ loader API to interact with resources. 
Returns: A tuple of (object_map, resource_map, asset_info): @@ -265,33 +263,15 @@ class _SaveableView(object): asset_index={}) for node_id, obj in enumerate(self.nodes): - if isinstance(obj, tracking.CapturableResource): - new_obj = object_map[obj] = copy.copy(obj) - # pylint: disable=protected-access - with ops.device(obj._resource_device): - new_resource = new_obj._create_resource() - new_obj._resource_handle = new_resource - # pylint: enable=protected-access - resource_map[obj.resource_handle] = new_resource - self.captured_tensor_node_ids[obj.resource_handle] = node_id - elif (ds_utils.is_distributed_variable(obj) or - resource_variable_ops.is_resource_variable(obj)): - obj_to_copy = obj._primary if ds_utils.is_distributed_variable( # pylint: disable=protected-access - obj) else obj - new_variable = resource_variable_ops.copy_to_graph_uninitialized( - obj_to_copy) - if ds_utils.is_distributed_variable(obj): - self.captured_tensor_node_ids[obj] = node_id - for v in obj.values: - object_map[v] = new_variable - resource_map[v.handle] = new_variable.handle - self.captured_tensor_node_ids[v.handle] = node_id - object_map[obj] = new_variable - resource_map[obj.handle] = new_variable.handle - self.captured_tensor_node_ids[obj.handle] = node_id - elif isinstance(obj, tracking.Asset): + if isinstance(obj, tracking.Asset): _process_asset(obj, asset_info, resource_map) self.captured_tensor_node_ids[obj.asset_path] = node_id + elif isinstance(obj, base.Trackable): + node_object_map, node_resource_map = obj._map_resources() # pylint: disable=protected-access + for capturable in node_resource_map.keys(): + self.captured_tensor_node_ids[capturable] = node_id + object_map.update(node_object_map) + resource_map.update(node_resource_map) # Note: some concrete functions can have been realized when tracing other # functions, and might closure-capture tensors from their parent functions. diff --git a/tensorflow/python/training/tracking/base.py b/tensorflow/python/training/tracking/base.py index e3cd9828724..ea76ad8db47 100644 --- a/tensorflow/python/training/tracking/base.py +++ b/tensorflow/python/training/tracking/base.py @@ -1021,3 +1021,21 @@ class Trackable(object): """ del serialization_cache return dict() + + def _map_resources(self): + """Makes new resource handle ops corresponding to existing resource tensors. + + Internal sub-classes can override this to inform model saving how to add new + resource handle ops to the main GraphDef of a SavedModel (TF 1.x style + graph), which allows session based APIs (e.g, C++ loader API) to interact + with resources owned by this object. + + Returns: + A tuple of (object_map, resource_map): + object_map: A dictionary mapping from objects that hold existing + resource tensors to replacement objects created to hold the new + resource tensors. + resource_map: A dictionary mapping from existing resource tensors to + newly created resource tensors. 
+ """ + return {}, {} diff --git a/tensorflow/python/training/tracking/tracking.py b/tensorflow/python/training/tracking/tracking.py index 553f0ec73bf..fb2735e6445 100644 --- a/tensorflow/python/training/tracking/tracking.py +++ b/tensorflow/python/training/tracking/tracking.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy import functools import weakref @@ -243,6 +244,18 @@ class CapturableResource(base.Trackable): self._resource_handle = self._create_resource() return self._resource_handle + def _map_resources(self): + """For implementing `Trackable`.""" + new_obj = copy.copy(self) + # pylint: disable=protected-access + with ops.device(self._resource_device): + new_resource = new_obj._create_resource() + new_obj._resource_handle = new_resource + # pylint: enable=protected-access + obj_map = {self: new_obj} + resource_map = {self.resource_handle: new_resource} + return obj_map, resource_map + def _list_functions_for_serialization(self, unused_functions): @def_function.function(input_signature=[], autograph=False) def _creator(): From 39504c25d9de697d3568bc4d370722d0f48376cf Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 18 Jun 2020 15:55:24 -0700 Subject: [PATCH 068/112] Fix bug in xla-legalize-tf-with-tf2xla pass by handling non-tensor operands Currently, it only expects tensor operands but that is not applicable for non tensorflow dialect ops. PiperOrigin-RevId: 317198672 Change-Id: I1387e664de740d044ef535f6903e07d63fa02f6d --- .../mlir/xla/tests/legalize-tf-with-tf2xla.mlir | 12 ++++++++++-- .../mlir/xla/transforms/legalize_tf_with_tf2xla.cc | 6 +++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index b8a6df54519..86a7f2b9e09 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -35,7 +35,7 @@ func @not_whitelisted_op(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor // CHECK-LABEL: unranked_operand func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: tf.Abs - // expected-remark@+1 {{lowering requires static shaped operands}} + // expected-remark@+1 {{lowering requires static shaped tensor operands}} %0 = "tf.Abs"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> @@ -44,12 +44,20 @@ func @unranked_operand(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: dynamic_operand func @dynamic_operand(%arg0: tensor) -> tensor { // CHECK: tf.Abs - // expected-remark@+1 {{lowering requires static shaped operands}} + // expected-remark@+1 {{lowering requires static shaped tensor operands}} %0 = "tf.Abs"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: tuple_type +func @tuple_type(%arg0: tuple, tensor>) -> tensor { + // Verifies that the pass can handle operands of non-tensor type like tuple + // from non TensorFlow ops. 
+ %0 = "xla_hlo.get_tuple_element"(%arg0) {index = 0 : i32} : (tuple, tensor>) -> tensor + return %0 : tensor +} + // CHECK-LABEL: unsupported_dtype func @unsupported_dtype(%arg0: tensor<2x!tf.variant>) -> tensor<2x!tf.variant> { // CHECK: tf.AddN diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index e57d6938efb..ef79c8868bb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -337,9 +337,9 @@ LogicalResult FuncLegalizer::LegalizeOp(Operation* op) { // Only static shaped operands are supported in XLA builders for now. for (Type ty : op->getOperandTypes()) { - auto ranked_ty = ty.cast(); - if (!ranked_ty.hasStaticShape()) { - op->emitRemark() << "lowering requires static shaped operands"; + auto ranked_ty = ty.dyn_cast(); + if (!ranked_ty || !ranked_ty.hasStaticShape()) { + op->emitRemark() << "lowering requires static shaped tensor operands"; return success(); } } From 8d34408863b650564076f148edad9f91508abf04 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 18 Jun 2020 16:02:23 -0700 Subject: [PATCH 069/112] Auto-generate following TensorFlow ops related to image ResizeBilinearGrad ResizeBilinear AdjustContrastv2 ResizeNearestNeighbor AdjustSaturation AdjustHue RGBToHSV HSVToRGB PiperOrigin-RevId: 317199967 Change-Id: I1953acf599f2f7de686bda73b654e4c7b98dffd5 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 3a5deb9c569..dcd083fc398 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -164,6 +164,81 @@ def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastable let hasFolder = 1; } +def TF_AdjustContrastv2Op : TF_Op<"AdjustContrastv2", [NoSideEffect]> { + let summary = "Adjust the contrast of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last 3 dimensions are +interpreted as `[height, width, channels]`. The other dimensions only +represent a collection of images, such as `[batch, height, width, channels].` + +Contrast is adjusted independently for each channel of each image. + +For each channel, the Op first computes the mean of the image pixels in the +channel and then adjusts each component of each pixel to +`(x - mean) * contrast_factor + mean`. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$contrast_factor + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_AdjustHueOp : TF_Op<"AdjustHue", [NoSideEffect]> { + let summary = "Adjust the hue of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last dimension is +interpreted as channels, and must be three. + +The input image is considered in the RGB colorspace. Conceptually, the RGB +colors are first mapped into HSV. A delta is then applied all the hue values, +and then remapped back to RGB colorspace. 
+ }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$delta + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_AdjustSaturationOp : TF_Op<"AdjustSaturation", [NoSideEffect]> { + let summary = "Adjust the saturation of one or more images."; + + let description = [{ +`images` is a tensor of at least 3 dimensions. The last dimension is +interpreted as channels, and must be three. + +The input image is considered in the RGB colorspace. Conceptually, the RGB +colors are first mapped into HSV. A scale is then applied all the saturation +values, and then remapped back to RGB colorspace. + }]; + + let arguments = (ins + TensorOf<[F16, F32]>:$images, + F32Tensor:$scale + ); + + let results = (outs + TensorOf<[F16, F32]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AllOp : TF_Op<"All", [NoSideEffect]> { let summary = [{ Computes the "logical and" of elements across dimensions of a tensor. @@ -3866,6 +3941,28 @@ tf.math.greater_equal(x, y) ==> [True, False, True, True] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_HSVToRGBOp : TF_Op<"HSVToRGB", [NoSideEffect]> { + let summary = "Convert one or more images from HSV to RGB."; + + let description = [{ +Outputs a tensor of the same shape as the `images` tensor, containing the RGB +value of the pixels. The output is only well defined if the value in `images` +are in `[0,1]`. + +See `rgb_to_hsv` for a description of the HSV encoding. + }]; + + let arguments = (ins + TF_FpTensor:$images + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_HashTableV2Op : TF_Op<"HashTableV2", []> { let summary = "Creates a non-initialized hash table."; @@ -6733,6 +6830,41 @@ the dimension is padded with zeros. TF_DerivedResultTypeAttr Tcomplex = TF_DerivedResultTypeAttr<0>; } +def TF_RGBToHSVOp : TF_Op<"RGBToHSV", [NoSideEffect]> { + let summary = "Converts one or more images from RGB to HSV."; + + let description = [{ +Outputs a tensor of the same shape as the `images` tensor, containing the HSV +value of the pixels. The output is only well defined if the value in `images` +are in `[0,1]`. + +`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and +`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0 +corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue. + +Usage Example: + +>>> blue_image = tf.stack([ +... tf.zeros([5,5]), +... tf.zeros([5,5]), +... tf.ones([5,5])], +... axis=-1) +>>> blue_hsv_image = tf.image.rgb_to_hsv(blue_image) +>>> blue_hsv_image[0,0].numpy() +array([0.6666667, 1. , 1. ], dtype=float32) + }]; + + let arguments = (ins + TF_FpTensor:$images + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_RandomGammaGradOp : TF_Op<"RandomGammaGrad", [NoSideEffect, ResultsBroadcastableShape]>, WithBroadcastableBinOpBuilder { let summary = [{ @@ -7230,6 +7362,27 @@ Input images can be of different types but output images are always float. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ResizeBilinearGradOp : TF_Op<"ResizeBilinearGrad", [NoSideEffect]> { + let summary = "Computes the gradient of bilinear interpolation."; + + let description = [{ + }]; + + let arguments = (ins + F32Tensor:$grads, + TF_FpTensor:$original_image, + + DefaultValuedAttr:$align_corners, + DefaultValuedAttr:$half_pixel_centers + ); + + let results = (outs + TF_FpTensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_ResizeNearestNeighborOp : TF_Op<"ResizeNearestNeighbor", [NoSideEffect]> { let summary = [{ Resize `images` to `size` using nearest neighbor interpolation. From 67544cd4bbdf8070adebbb077439cac300f479ca Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 18 Jun 2020 16:02:51 -0700 Subject: [PATCH 070/112] Add more outside compilation tests including multiple clusters and more variety of inputs/outputs. PiperOrigin-RevId: 317200078 Change-Id: Id26e99059097073299ef5f681fae053b082ec149 --- .../tpu/tpu_outside_compilation_test.py | 95 ++++++++++++++++++- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/tpu/tpu_outside_compilation_test.py b/tensorflow/python/tpu/tpu_outside_compilation_test.py index f7ecb294c44..54c2598324c 100644 --- a/tensorflow/python/tpu/tpu_outside_compilation_test.py +++ b/tensorflow/python/tpu/tpu_outside_compilation_test.py @@ -18,13 +18,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized +import numpy as np + from tensorflow.python.distribute import tpu_strategy as tpu_lib from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver from tensorflow.python.eager import def_function from tensorflow.python.eager import remote from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.ops import array_ops from tensorflow.python.ops import logging_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables from tensorflow.python.platform import flags from tensorflow.python.tpu import tpu @@ -52,7 +57,7 @@ def get_tpu_strategy(): return tpu_lib.TPUStrategy(resolver) -class TpuOutsideCompilationTest(test.TestCase): +class TpuOutsideCompilationTest(test.TestCase, parameterized.TestCase): def testResourceVariableAssignOnHost(self): strategy = get_tpu_strategy() @@ -79,6 +84,26 @@ class TpuOutsideCompilationTest(test.TestCase): self.assertAllEqual(4.0 * strategy.num_replicas_in_sync, v2.numpy()) self.assertAllEqual(5.0, v.numpy()) + def testHostNoInput(self): + strategy = get_tpu_strategy() + + def outside_fn(): + logging_ops.print_v2("Outside compiled") + + @def_function.function + def train_step(): + + def tpu_fn(x): + x2 = x + 5.0 + tpu.outside_compilation(outside_fn) + return x2 + 5.0 + + return strategy.run(tpu_fn, args=(25.0,)) + + self.assertAllEqual( + strategy.experimental_local_results(train_step()), + constant_op.constant(35., shape=(strategy.num_replicas_in_sync))) + def testHostInputOnly(self): strategy = get_tpu_strategy() @@ -120,13 +145,71 @@ class TpuOutsideCompilationTest(test.TestCase): strategy.experimental_local_results(train_step()), constant_op.constant(36., shape=(strategy.num_replicas_in_sync))) - def testOutsideCompilationControlFlowIf(self): + def testHostMultipleInputs(self): + strategy = get_tpu_strategy() + val0 = np.arange(6).reshape((2, 3)).astype(np.float32) + val1 = np.arange(6).reshape((3, 
2)).astype(np.float32) + + def outside_fn(arg0, arg1): + tmp = array_ops.reshape(arg1, array_ops.shape(arg0)) + ret0 = arg0 + tmp + ret1 = math_ops.matmul(arg0, arg1) + ret2 = array_ops.concat([arg0, tmp], 0) + return ret0, ret1, ret2 + + @def_function.function + def train_step(): + + def tpu_fn(x, y): + a = x + 7.0 + b = y * 2.0 + c, d, e = tpu.outside_compilation(outside_fn, a, b) + return (math_ops.reduce_max(c) + math_ops.reduce_min(d) + + math_ops.reduce_sum(e)) + + return strategy.run(tpu_fn, args=(val0, val1)) + + self.assertAllEqual( + strategy.experimental_local_results(train_step()), + constant_op.constant(213., shape=(strategy.num_replicas_in_sync))) + + def testMultipleClusters(self): + strategy = get_tpu_strategy() + + def outside_fn1(x): + logging_ops.print_v2("Outside compiled", x) + return x + 6.0 + + def outside_fn2(x): + logging_ops.print_v2("Outside compiled", x) + return x - 18.0 + + @def_function.function + def train_step(): + + def tpu_fn(x): + x2 = x + 5.0 + output1 = tpu.outside_compilation(outside_fn1, x2) + x3 = output1 + 3.0 + output2 = tpu.outside_compilation(outside_fn2, x3) + return output2 + + return strategy.run(tpu_fn, args=(25.0,)) + + self.assertAllEqual( + strategy.experimental_local_results(train_step()), + constant_op.constant(21., shape=(strategy.num_replicas_in_sync))) + + @parameterized.parameters((True), (False)) + def testOutsideCompilationControlFlowIf(self, take_true_branch): strategy = get_tpu_strategy() def outside_fn(x): logging_ops.print_v2("Outside compiled", x) return x + 6.0 + input_value = 51.0 if take_true_branch else 25.0 + @def_function.function def train_step(): @@ -137,11 +220,15 @@ class TpuOutsideCompilationTest(test.TestCase): else: return x2 - return strategy.run(tpu_fn, args=(25.0,)) + return strategy.run(tpu_fn, args=(input_value,)) + output_value = 36.0 + if take_true_branch: + output_value = 56.0 self.assertAllEqual( strategy.experimental_local_results(train_step()), - constant_op.constant(36., shape=(strategy.num_replicas_in_sync))) + constant_op.constant( + output_value, shape=(strategy.num_replicas_in_sync))) def testOutsideCompilationControlFlowWhile(self): strategy = get_tpu_strategy() From 4aea552e064cf92330e07e83a3b5a1ca2a7034d0 Mon Sep 17 00:00:00 2001 From: Henry Tan Date: Thu, 18 Jun 2020 16:15:22 -0700 Subject: [PATCH 071/112] Publishing tpu_op_consts to tpu kernels library. 
PiperOrigin-RevId: 317202394 Change-Id: Ib6a1f350af7384513a3744084a9959ed86278d1f --- tensorflow/core/tpu/kernels/BUILD | 11 +++++- .../kernels/tpu_compilation_cache_external.h | 5 +-- tensorflow/core/tpu/kernels/tpu_op_consts.cc | 24 ++++++++++++ tensorflow/core/tpu/kernels/tpu_op_consts.h | 39 +++++++++++++++++++ 4 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/tpu/kernels/tpu_op_consts.cc create mode 100644 tensorflow/core/tpu/kernels/tpu_op_consts.h diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index a41747ee8c5..a9f2202cd45 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -321,6 +321,7 @@ cc_library( ":tpu_compile_c_api_hdrs", ":tpu_compile_op_support", ":tpu_mesh_state_interface", + ":tpu_op_consts", ":tpu_program_group", ":tpu_util", ":trace_util_hdrs", @@ -433,7 +434,6 @@ cc_library( "//tensorflow/compiler/xla/client:compile_only_client", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/status", "@com_google_absl//absl/strings", ], alwayslink = 1, @@ -461,3 +461,12 @@ cc_library( "@com_google_absl//absl/synchronization", ], ) + +cc_library( + name = "tpu_op_consts", + srcs = ["tpu_op_consts.cc"], + hdrs = ["tpu_op_consts.h"], + deps = [ + "@com_google_absl//absl/base:core_headers", + ], +) diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h index fe251326a43..86615b15d4c 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_external.h @@ -38,15 +38,12 @@ limitations under the License. #include "tensorflow/core/tpu/kernels/tpu_compile_c_api.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h" +#include "tensorflow/core/tpu/kernels/tpu_op_consts.h" #include "tensorflow/core/tpu/kernels/tpu_program_group.h" namespace tensorflow { namespace tpu { -constexpr char kCompilationCacheResourceName[] = "tpu_compilation_cache"; -constexpr char kCompilationCacheUnloaderResourceName[] = - "tpu_compilation_cache_unloader"; - class TpuCompilationCacheExternal : public TpuCompilationCacheInterface { public: using Status = ::stream_executor::port::Status; diff --git a/tensorflow/core/tpu/kernels/tpu_op_consts.cc b/tensorflow/core/tpu/kernels/tpu_op_consts.cc new file mode 100644 index 00000000000..e5e1aacb3cc --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_consts.cc @@ -0,0 +1,24 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/core/tpu/kernels/tpu_op_consts.h" + +namespace tensorflow { +namespace tpu { +const char kCompilationCacheResourceName[] = "tpu_compilation_cache"; +const char kCompiledProtoCacheResourceName[] = "tpu_proto_cache"; +const char kCompilationCacheUnloaderResourceName[] = + "tpu_compilation_cache_unloader"; +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_op_consts.h b/tensorflow/core/tpu/kernels/tpu_op_consts.h new file mode 100644 index 00000000000..25223b7e429 --- /dev/null +++ b/tensorflow/core/tpu/kernels/tpu_op_consts.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ +#define TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ + +#include "absl/base/attributes.h" + +namespace tensorflow { +namespace tpu { + +// Resource names in the ResourceMgr. +// +// Name of cache for compiled TPU ISA protos. CompilationCache is created by +// ConfigureDistributedTpuOp, so only the master has a CompilationCache. +ABSL_CONST_INIT extern const char kCompilationCacheResourceName[]; +// Name of base class allowing Execute Ops to look up ISA protos. +// CompiledProtoCache is created by InitializeHostForDistributedTpuOp, so each +// tpu_worker has a CompiledProtoCache. +ABSL_CONST_INIT extern const char kCompiledProtoCacheResourceName[]; +// Name of cache unloader for compiled TPU ISA protos. Cache unloader should be +// put into TPU_SYSTEM device resource manager. Inference may use it to unload +// cache entries created during lifetime of a DirectSession. +ABSL_CONST_INIT extern const char kCompilationCacheUnloaderResourceName[]; + +} // namespace tpu +} // namespace tensorflow +#endif // TENSORFLOW_CORE_TPU_KERNELS_TPU_OP_CONSTS_H_ From aab151356d2334a9d6cec71ce5165e6e6c45c793 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 16:24:14 -0700 Subject: [PATCH 072/112] [pjrt] Refresh stream error status in strategic places to flush out silent failures. 
PiperOrigin-RevId: 317204018 Change-Id: If75a3ad9ec846ce1621cdba92a2dc738b65b7001 --- tensorflow/compiler/xla/pjrt/local_device_state.cc | 4 ++++ tensorflow/compiler/xla/pjrt/pjrt_client.cc | 12 +++++++++--- tensorflow/stream_executor/stream.cc | 7 ++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/pjrt/local_device_state.cc b/tensorflow/compiler/xla/pjrt/local_device_state.cc index d173c891c95..a229e56001e 100644 --- a/tensorflow/compiler/xla/pjrt/local_device_state.cc +++ b/tensorflow/compiler/xla/pjrt/local_device_state.cc @@ -127,11 +127,15 @@ std::unique_ptr LocalDeviceState::BorrowStreamFromPool() { } else { std::unique_ptr stream = std::move(usage_stream_pool_.top()); usage_stream_pool_.pop(); + stream->RefreshStatus().IgnoreError(); // Can return error::Unimplemented + QCHECK(stream->ok()); return stream; } } void LocalDeviceState::ReturnStreamToPool(std::unique_ptr stream) { + stream->RefreshStatus().IgnoreError(); // Can return error::Unimplemented + QCHECK(stream->ok()); absl::MutexLock lock(&mu_); usage_stream_pool_.push(std::move(stream)); } diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index ef259cf1cfd..46f592100c9 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -751,16 +751,22 @@ StatusOr> PjRtBuffer::FromHostLiteral( // memory that has already been allocated, and a possible Event // allocation. + se::Stream* h2d_stream = local_device->host_to_device_stream(); ShapedBuffer buffer = device_buffer->AsShapedBuffer( compact_shape, on_device_shape, client->client()->platform()); TF_CHECK_OK(transfer_manager->TransferLiteralToDeviceAsync( - local_device->host_to_device_stream(), literal, buffer)); + h2d_stream, literal, buffer)); std::shared_ptr event = device_buffer->definition_events()[0]; TF_CHECK_OK(AddDestinationBufferSynchronization( - local_device, std::move(device_buffer), event, - local_device->host_to_device_stream())); + local_device, std::move(device_buffer), event, h2d_stream)); + + // This can sometimes catch the case where the literal memory has been + // freed before the H2D transfer was issued. + h2d_stream->RefreshStatus() + .IgnoreError(); // Can return error::Unimplemented + QCHECK(h2d_stream->ok()); }; client->h2d_transfer_pool()->Schedule(transfer_h2d); return py_buffer; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index c63565c65a8..da418122375 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -285,7 +285,12 @@ Stream::~Stream() { port::Status Stream::RefreshStatus() { port::Status status = parent_->GetStatus(this); - CheckStatus(status); + // We should not put the stream in an error state, just because the GetStatus + // method is unimplemented. + if (status != port::Status(port::error::UNIMPLEMENTED, + "GetStatus is not supported on this executor.")) { + CheckStatus(status); + } return status; } From 94e37f84f19384e685420ef7f90382fcfe719498 Mon Sep 17 00:00:00 2001 From: Dero Gharibian Date: Thu, 18 Jun 2020 16:25:11 -0700 Subject: [PATCH 073/112] Remove unnecessary string copy. 
PiperOrigin-RevId: 317204219 Change-Id: I85fab345945b6ea4f428f8aedc861eb79e5fd7e0 --- tensorflow/core/kernels/summary_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc index 386a8964dba..f4c91fc9ff1 100644 --- a/tensorflow/core/kernels/summary_op.cc +++ b/tensorflow/core/kernels/summary_op.cc @@ -52,7 +52,8 @@ class SummaryScalarOp : public OpKernel { Summary s; for (int i = 0; i < Ttags.size(); i++) { Summary::Value* v = s.add_value(); - v->set_tag(string(Ttags(i))); // NOLINT + const tstring& Ttags_i = Ttags(i); + v->set_tag(Ttags_i.data(), Ttags_i.size()); v->set_simple_value(float(Tvalues(i))); } @@ -102,7 +103,8 @@ class SummaryHistoOp : public OpKernel { Summary s; Summary::Value* v = s.add_value(); - v->set_tag(string(tags.scalar()())); // NOLINT + const tstring& tags0 = tags.scalar()(); + v->set_tag(tags0.data(), tags0.size()); histo.EncodeToProto(v->mutable_histo(), false /* Drop zero buckets */); Tensor* summary_tensor = nullptr; From 83b09270dc34308ef60f2f68de540a4cb213e1e4 Mon Sep 17 00:00:00 2001 From: Lucy Fox Date: Thu, 18 Jun 2020 16:52:46 -0700 Subject: [PATCH 074/112] Update FusedKernelMatcher pass to use upstream util to get stripped op name. Added this util upstream in D81435, so now using that instead and deleting the unneeded code here. PiperOrigin-RevId: 317209256 Change-Id: Id2d8a1fca34ca85e59a05a85bf7f6f59b425c7c1 --- .../tensorflow/transforms/fused_kernel_matcher.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index 4b10550df7b..d10f5e26e8f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -52,11 +52,6 @@ struct FusedKernelMatcherPass void runOnFunction() override; }; -// Returns an op's name with the dialect prefix stripped off. -StringRef GetOpNameWithoutDialect(Operation *op) { - return op->getName().getStringRef().split(".").second; -} - bool IsActivationFunction(Operation *op) { return isa(op) || isa(op) || isa(op); } @@ -128,8 +123,8 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { } SmallVector locations{contraction.getLoc(), bias_add.getLoc()}; - SmallVector fused_ops{ - StringAttr::get(GetOpNameWithoutDialect(bias_add), context)}; + SmallVector fused_ops{StringAttr::get( + bias_add.getOperation()->getName().stripDialect(), context)}; // BiasAdd may or may not feed into an activation function. auto activation = GetActivation(bias_add); @@ -143,7 +138,7 @@ class FuseContractionWithBiasAdd : public OpRewritePattern { if (fuse_activation) { locations.push_back(activation->getLoc()); fused_ops.push_back( - StringAttr::get(GetOpNameWithoutDialect(activation), context)); + StringAttr::get(activation->getName().stripDialect(), context)); result_type = activation->getResultTypes().front(); } else { result_type = bias_add.getResult().getType(); From e96543f6fbba9fa112d7ca1d731b64e3654e1629 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Thu, 18 Jun 2020 16:55:26 -0700 Subject: [PATCH 075/112] Add MultiProcessPoolRunner Tensorflow initialization can take a long time when GPUs are present. We cannot afford starting a new group of workers for every single test. MultiProcessPoolRunner uses a pool of workers so that we can avoid the initialization cost. 
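As a rough usage sketch (mirroring the tests added in this change rather than documenting new API surface), a pool runner is built once per cluster spec and reused across calls; `fn` is a placeholder callable, and test binaries are still expected to use `multi_process_runner.test_main()` as their entry point:

    from tensorflow.python.distribute import multi_process_runner
    from tensorflow.python.distribute import multi_worker_test_base


    def fn():
      # Runs in every pooled worker; non-None return values are collected by run().
      return 'hello'


    cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2)
    runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec)
    first = runner.run(fn)   # lazily starts the worker pool on first use
    second = runner.run(fn)  # later calls reuse the same worker processes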
Compared to MultiProcessRunner, it doesn't support terminating workers. Note that implementation wise we could build MultiProcessPoolRunner on top of MultiProcessRunner or vice-versa if there's no need to support termination. Since it's easier for MultiProcessPoolRunner not to support termination, we choose MultiProcessPoolRunner on top of MultiProcessRunner. PiperOrigin-RevId: 317209754 Change-Id: Ia439028c81c5a9f87b16d631a170158724ce47d4 --- .../python/distribute/multi_process_runner.py | 205 ++++++++++++++++-- .../distribute/multi_process_runner_test.py | 69 +++++- 2 files changed, 249 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py index 8699e59b410..ce36287a9da 100644 --- a/tensorflow/python/distribute/multi_process_runner.py +++ b/tensorflow/python/distribute/multi_process_runner.py @@ -67,8 +67,7 @@ except ImportError: # exception stack trace info is stored in exc_info to pass on to parent process # to be re-raised. _ProcessStatusInfo = collections.namedtuple( - '_ProcessStatusInfo', - ['task_type', 'is_successful', 'exc_info', 'return_value']) + '_ProcessStatusInfo', ['is_successful', 'exc_info', 'return_value']) # Information returned from a successful MultiProcessRunner run. MultiProcessRunnerResult = collections.namedtuple('MultiProcessRunnerResult', @@ -124,6 +123,7 @@ class MultiProcessRunner(object): stream_stdout=True, list_stdout=False, use_dill_for_args=True, + daemon=False, args=None, kwargs=None): """Creates a multi-process runner. @@ -157,6 +157,7 @@ class MultiProcessRunner(object): use_dill_for_args: Whether to use dill to pickle `args` and `kwargs`. dill can pickle more objects, but doesn't work with types in `multiprocessing` library like `Mutex`. + daemon: Whether to start processes as daemons. args: Positional arguments to be sent to functions run on processes. kwargs: Keyword arguments to be sent to functions run on processes. @@ -188,6 +189,7 @@ class MultiProcessRunner(object): self._list_stdout = list_stdout self._dependence_on_chief = True self._use_dill_for_args = use_dill_for_args + self._daemon = daemon self._args = args or () self._kwargs = kwargs or {} @@ -268,7 +270,8 @@ class MultiProcessRunner(object): test_env=test_env, target=_ProcFunc(), args=(resources, test_env, proc_func, args, kwargs, - self._use_dill_for_args)) + self._use_dill_for_args), + daemon=self._daemon) p.start() self._processes[(task_type, task_id)] = p self._outstanding_subprocess_count += 1 @@ -568,7 +571,6 @@ class _ProcFunc(object): time.sleep(0.1) self._resources.process_status_queue.put( _ProcessStatusInfo( - task_type=task_type, is_successful=True, exc_info=None, return_value=None)) @@ -628,17 +630,9 @@ class _ProcFunc(object): if test_env.v2_enabled: v2_compat.enable_v2_behavior() - try: - with self._runtime_mode(test_env.executing_eagerly): - return_value = proc_func(*args, **kwargs) - is_successful = True - exc_info = None - - except Exception: # pylint: disable=broad-except - # Capture all exceptions to be reported to parent process. 
- return_value = None - is_successful = False - exc_info = sys.exc_info() + with self._runtime_mode(test_env.executing_eagerly): + info = _run_contained(proc_func, args, kwargs) + self._resources.process_status_queue.put(info) # Re-raise the exception in addition to reporting it to the parent # process, so that even if `--test_timeout` flag is set and the @@ -647,18 +641,183 @@ class _ProcFunc(object): # instead of silently suppressing the error due to early bazel # timeout. Raising an error in the subprocess produces stack trace in # the log, but the program continues running. - raise + if not info.is_successful: + six.reraise(*info.exc_info) - finally: - info = _ProcessStatusInfo( - task_type=test_env.task_type, - is_successful=is_successful, - exc_info=exc_info, - return_value=return_value) - self._resources.process_status_queue.put(info) self._close_streaming() +class MultiProcessPoolRunner(object): + """A utility class to start a process pool to simulate a cluster. + + It's similar to MultiProcessRunner, but uses a pool of processes to avoid the + expensive initialization cost of Tensorflow. + """ + + def __init__(self, cluster_spec, initializer=None): + """Creates a multi-process pool runner. + + Args: + cluster_spec: Dict for cluster spec. The following is an example of + cluster with three workers. + {"worker": ["worker0.example.com:2222", + "worker1.example.com:2222", + "worker2.example.com:2222"]} + initializer: a callable to called at the startup of worker processes. + + Raises: + RuntimeError: if `multi_process_runner.test_main()` is not called. + ValueError: if there are more than one chief in the `cluster_spec`. + """ + self._cluster_spec = cluster_spec + self._initializer = initializer + self._conn = {} + self._runner = None + + def __del__(self): + self._reset() + + def _reset(self): + for conn in self._conn.values(): + conn.close() + self._conn = {} + if self._runner is not None: + self._runner.join() + self._runner = None + + def _start(self): + """Starts the worker pool.""" + # We need different arguments for different processes so we're passing a + # no-op proc_func here and use start_single_process instead. + # + # We also need to start the process pool as daemon, so that they don't block + # the program from exiting. Note that __del__ may not get called when + # there's an exception. The user may also store a pool runner in a global + # object to share across test cases + self._runner = MultiProcessRunner( + proc_func=lambda: None, + cluster_spec=self._cluster_spec, + use_dill_for_args=False, + daemon=True) + if self._initializer: + initializer = dill.dumps(self._initializer, dill.HIGHEST_PROTOCOL) + else: + initializer = None + for task_type, addresses in self._cluster_spec.items(): + for task_id, _ in enumerate(addresses): + conn1, conn2 = multiprocessing.Pipe(duplex=True) + self._conn[(task_type, task_id)] = conn1 + self._runner.start_single_process( + task_type, + task_id, + proc_func=_pool_runner_worker, + args=(initializer, conn2)) + + def run(self, proc_func, args=None, kwargs=None): + """Runs `proc_func` with `args` and `kwargs` on all jobs. + + Args: + proc_func: The function to be run. + args: Optional positional arguments to be supplied in `proc_func`. + kwargs: Optional keyword arguments to be supplied in `proc_func`. + + Returns: + A list of return values. + """ + if self._runner is None: + self._start() + + # Since we start the processes as daemon they're going to be killed by + # SIGTERM when the program exits. 
We only turn on streaming during run() to + # avoid printing the stacktrace caused by the SIGTERM. + self._runner._stream_stdout = True # pylint: disable=protected-access + + try: + proc_func = dill.dumps(proc_func, dill.HIGHEST_PROTOCOL) + for conn in self._conn.values(): + conn.send((proc_func, args or [], kwargs or {})) + + process_statuses = [] + for (task_type, task_id), conn in self._conn.items(): + logging.info('Waiting for the result from %s-%d', task_type, task_id) + try: + process_statuses.append(conn.recv()) + except EOFError: + # This shouldn't happen due to exceptions in proc_func. This usually + # means bugs in the runner. + self._reset() + raise RuntimeError('Unexpected EOF. Worker process may have died. ' + 'Please report a bug') + + return_values = [] + for process_status in process_statuses: + assert isinstance(process_status, _ProcessStatusInfo) + if not process_status.is_successful: + six.reraise(*process_status.exc_info) + if process_status.return_value is not None: + return_values.append(process_status.return_value) + + return return_values + finally: + self._runner._stream_stdout = False # pylint: disable=protected-access + + +def _pool_runner_worker(initializer, conn): + """Function that runs on the workers in a pool. + + It listens for callables to run and returns the result until `conn` is closed. + It captures the exceptions during executing the callable and return it through + `conn`. + + Args: + initializer: A callable to execute during startup. + conn: A multiprocessing.Connection object to listen for tasks and send + results. + """ + if initializer: + initializer = dill.loads(initializer) + initializer() + while True: + try: + proc_func, args, kwargs = conn.recv() + except EOFError: + break + proc_func = dill.loads(proc_func) + info = _run_contained(proc_func, args, kwargs) + sys.stdout.flush() + sys.stderr.flush() + conn.send(info) + + +def _run_contained(proc_func, args, kwargs): + """Runs `proc_func` with `args` and `kwargs`. + + The function returns _ProcessStatusInfo which captures the return value and + the exception. + + Args: + proc_func: The function to be run. + args: Optional positional arguments to be supplied in `proc_func`. + kwargs: Optional keyword arguments to be supplied in `proc_func`. + + Returns: + a _ProcessStatusInfo. + """ + try: + return_value = proc_func(*args, **kwargs) + is_successful = True + exc_info = None + except Exception: # pylint: disable=broad-except + return_value = None + is_successful = False + exc_info = sys.exc_info() + finally: + return _ProcessStatusInfo( # pylint: disable=lost-exception + is_successful=is_successful, + exc_info=exc_info, + return_value=return_value) + + class SubprocessTimeoutError(RuntimeError): """An error that indicates there is at least one subprocess timing out. 
diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index aeba43b6b7c..d76ef5a5a3c 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -22,6 +22,8 @@ import json import os import threading import time +import unittest + from absl import logging from tensorflow.python.distribute import multi_process_runner @@ -45,7 +47,7 @@ def proc_func_that_adds_simple_return_data(): return 'dummy_data' -def proc_func_that_return_args_and_kwargs(*args, **kwargs): +def proc_func_that_returns_args_and_kwargs(*args, **kwargs): return list(args) + list(kwargs.items()) @@ -53,6 +55,20 @@ def proc_func_with_barrier(): return multi_process_runner.barrier() +def proc_func_that_returns_pid(): + return os.getpid() + + +V = None + + +def proc_func_that_sets_global(val): + global V + old_val = V + V = val + return old_val + + class MultiProcessRunnerTest(test.TestCase): def _worker_idx(self): @@ -95,7 +111,7 @@ class MultiProcessRunnerTest(test.TestCase): def test_multi_process_runner_args_passed_correctly(self): return_value = multi_process_runner.run( - proc_func_that_return_args_and_kwargs, + proc_func_that_returns_args_and_kwargs, multi_worker_test_base.create_cluster_spec(num_workers=1), args=('a', 'b'), kwargs={ @@ -325,5 +341,54 @@ class MultiProcessRunnerTest(test.TestCase): for line in list_to_assert)) +class MultiProcessPoolRunnerTest(test.TestCase): + + def test_same_process_across_runs(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + pid = runner.run(proc_func_that_returns_pid) + for _ in range(3): + self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid) + + def test_exceptions_in_sub_process(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + pid = runner.run(proc_func_that_returns_pid) + with self.assertRaisesRegexp(ValueError, 'This is an error.'): + runner.run(proc_func_that_errors) + self.assertAllEqual(runner.run(proc_func_that_returns_pid), pid) + + def test_tf_config(self): + cluster_spec = multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + result = runner.run(proc_func_that_adds_task_type_in_return_data) + + job_count_dict = {'worker': 2, 'chief': 1} + for data in result: + job_count_dict[data] -= 1 + + self.assertEqual(job_count_dict['worker'], 0) + self.assertEqual(job_count_dict['chief'], 0) + + @unittest.expectedFailure + def test_exception_in_main_process(self): + # When there's an exception in the main process, __del__() is not called. + # This test is to verify MultiProcessPoolRunner can cope with __del__() not + # being called. 
+ cluster_spec = multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner(cluster_spec) + runner.run(proc_func_that_returns_pid) + raise ValueError('failure') + + def test_initializer(self): + cluster_spec = multi_worker_test_base.create_cluster_spec(num_workers=2) + runner = multi_process_runner.MultiProcessPoolRunner( + cluster_spec, initializer=lambda: proc_func_that_sets_global(1)) + result = runner.run(proc_func_that_sets_global, args=(2,)) + self.assertAllEqual(result, [1, 1]) + + if __name__ == '__main__': multi_process_runner.test_main() From ae76bc79213d4559b113899f438cf54283ec11c2 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 18 Jun 2020 17:19:14 -0700 Subject: [PATCH 076/112] Update XNNPACK dependency and document sparse inference capability PiperOrigin-RevId: 317213816 Change-Id: I35431b40fd63d836d4fe979f65a71a181c0c820d --- tensorflow/lite/delegates/xnnpack/README.md | 37 +++++++++++++++++++++ tensorflow/workspace.bzl | 8 ++--- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md index 97d2d5565db..d94e92c7306 100644 --- a/tensorflow/lite/delegates/xnnpack/README.md +++ b/tensorflow/lite/delegates/xnnpack/README.md @@ -238,6 +238,43 @@ Below is the list of current operators and limitations: * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. +### Sparse Inference (experimental) + +XNNPACK backend supports sparse inference for CNN models described in the +[Fast Sparse ConvNets](https://arxiv.org/abs/1911.09723) paper. This +functionality must be enabled at build-time via +`--define xnn_enable_sparse=true` Bazel flag. Sparse inference is restricted +to subgraphs with the following operators: + +* Sparse subgraph must start with a 3x3 stride-2 `CONV_2D` operator with + padding 1 on each side, no dilation, and 3 input channels. +* Sparse subgraph must end with a `MEAN` operator that does reduction across + spatial axes. +* Sparse subgraph may contain the following operators: + * `CONV_2D` with 1x1 kernel and no padding. It is important to have high + sparsity (at least 70%) in the filter of this operator to get speedup + over dense inference. + * `DEPTHWISE_CONV_2D` with 3x3 kernel, stride 1, no dilation, and padding 1 + on each side. + * `DEPTHWISE_CONV_2D` with 3x3 kernel, stride 2, no dilation, and padding 1 + on each side. + * `DEPTHWISE_CONV_2D` with 5x5 kernel, stride 1, no dilation, and padding 2 + on each side. + * `DEPTHWISE_CONV_2D` with 5x5 kernel, stride 2, no dilation, and padding 2 + on each side. + * `ADD` and `MUL` operators where both inputs are 4D tensors. If one of the + inputs to `ADD` or `MUL` is a constant tensor, it must be representable as + either a scalar, or a 1D vector. + * Unary elementwise operators `ABS`, `CEIL`, `FLOOR`, `HARD_SWISH`, + `LEAKY_RELU`, `LOGISTIC`, `NEG`, `RELU`, `RELU6`, `RELU_N1_TO_1`, `ROUND`, + and `SQUARE`. + +Pre-trained [Fast Sparse ConvNets models](https://github.com/google-research/google-research/tree/master/fastconvnets) +provide examples that satisfy these constrains. + +In addition to acceleration, sparse models get the compression benefit by +storing only non-zero values in the [TensorFlow Lite file format](https://github.com/tensorflow/tensorflow/blob/4aea552e064cf92330e07e83a3b5a1ca2a7034d0/tensorflow/lite/schema/schema.fbs#L84-L109). 
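For a quick end-to-end check (a sketch, not taken from this change): build the TensorFlow Lite binaries with the `--define xnn_enable_sparse=true` flag described above, then run a pruned model through the Python interpreter as usual. Whether the XNNPACK delegate is applied automatically depends on how the interpreter was built, and `sparse_model.tflite` is a placeholder path:

    import numpy as np
    import tensorflow as tf

    # Placeholder path; any Fast Sparse ConvNets .tflite flatbuffer is loaded the same way.
    interpreter = tf.lite.Interpreter(model_path='sparse_model.tflite')
    interpreter.allocate_tensors()

    input_detail = interpreter.get_input_details()[0]
    dummy = np.zeros(input_detail['shape'], dtype=input_detail['dtype'])
    interpreter.set_tensor(input_detail['index'], dummy)
    interpreter.invoke()
    output = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])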
+ ### Other limitations * Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f5b0b7537dc..52c573628ac 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "4af883fea0a6ada106867f29670a6c0b7af74bee85d74a2e04356a670814a3d4", - strip_prefix = "XNNPACK-69a6a7667d96a84c596b0f4e00632b2037c17723", + sha256 = "2527a30464b43bd03f137b2c455a0381e49eae63d09cfeee128a717dfbe962d5", + strip_prefix = "XNNPACK-8b283aa30a3186c6e640aed520543e9c067132d2", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/69a6a7667d96a84c596b0f4e00632b2037c17723.zip", - "https://github.com/google/XNNPACK/archive/69a6a7667d96a84c596b0f4e00632b2037c17723.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/8b283aa30a3186c6e640aed520543e9c067132d2.zip", + "https://github.com/google/XNNPACK/archive/8b283aa30a3186c6e640aed520543e9c067132d2.zip", ], ) From 723751b20ef2aa0a4af39cad2581fd483ae78ad7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 18:15:12 -0700 Subject: [PATCH 077/112] Add set_outfeed_config in XLA HloInstruction. PiperOrigin-RevId: 317222410 Change-Id: I5de8a5067f1002a9d656d4e26d145ffe3fe372ed --- tensorflow/compiler/xla/service/hlo_instruction.cc | 4 ++++ tensorflow/compiler/xla/service/hlo_instruction.h | 3 +++ tensorflow/compiler/xla/service/hlo_instructions.h | 1 + 3 files changed, 8 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index cfa21b95dd2..6de76c1cc63 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -3908,6 +3908,10 @@ const string& HloInstruction::outfeed_config() const { return Cast(this)->outfeed_config(); } +void HloInstruction::set_outfeed_config(const string& config) { + return Cast(this)->set_outfeed_config(config); +} + const std::vector& HloInstruction::replica_groups() const { return Cast(this)->replica_groups(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 7a5d506b681..f3bb59ff625 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -1755,6 +1755,9 @@ class HloInstruction { // Returns the config for the Outfeed instruction. const string& outfeed_config() const; + // Delegates to HloOutfeedInstruction::set_outfeed_config. + void set_outfeed_config(const string& config); + // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const; diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 6da01dc088e..f5a963ef063 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -1141,6 +1141,7 @@ class HloOutfeedInstruction : public HloInstruction { const Shape& outfeed_shape() const { return outfeed_shape_; } // Returns the config for the Outfeed instruction. const string& outfeed_config() const { return outfeed_config_; } + void set_outfeed_config(const string& config) { outfeed_config_ = config; } // Returns a serialized representation of this instruction. 
HloInstructionProto ToProto() const override; From b7edd44ee0f8c264e457c48138474f6e1bf5b18e Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Thu, 18 Jun 2020 19:08:04 -0700 Subject: [PATCH 078/112] Enable type annotations for python/ops. PiperOrigin-RevId: 317229132 Change-Id: I7055e650308c2fc83969385dd25e86fb5b073d75 --- tensorflow/python/ops/logging_ops.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 8ca63f55987..02fce277690 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -54,11 +54,9 @@ except NameError: # call relies on certain conditionals for its dependencies. Use # control_flow_ops.Assert. -# Assert and Print are special symbols in python, so we must -# have an upper-case version of them. -# -# For users with Python 3 or Python 2.7 -# with `from __future__ import print_function`, we could also allow lowercase. +# Assert and Print are special symbols in Python 2, so we must +# have an upper-case version of them. When support for it is dropped, +# we can allow lowercase. # See https://github.com/tensorflow/tensorflow/issues/18053 @@ -83,11 +81,6 @@ def Print(input_, data, message=None, first_n=None, summarize=None, name=None): with jupyter notebook (printing to the notebook *server's* output, not into the notebook). - Additionally, to use tf.print in python 2.7, users must make sure to import - the following: - - `from __future__ import print_function` - Args: input_: A tensor passed through this op. data: A list of tensors to print out when op is evaluated. @@ -148,11 +141,6 @@ def print_v2(*inputs, **kwargs): Python objects. Printed tensors will recursively show the first and last elements of each dimension to summarize. - @compatibility(python2) - In python 2.7, make sure to import the following: - `from __future__ import print_function` - @end_compatibility - Example: Single-input usage: From 13fe5862de7b95fd91aeec8f2d71e9f2e77b699b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 18 Jun 2020 19:11:15 -0700 Subject: [PATCH 079/112] Integrate LLVM at https://github.com/llvm/llvm-project/commit/c830d517b4e4 PiperOrigin-RevId: 317229564 Change-Id: I10163c3e668996252d294018794081394cc0d25c --- third_party/mlir/test.BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index 23287ce28d6..14c2ba7778e 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -166,6 +166,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:StandardOpsTransforms", "@llvm-project//mlir:Support", "@llvm-project//mlir:TargetNVVMIR", "@llvm-project//mlir:TargetROCDLIR", From 4a14e778d64853a236941259693aa3c5813c18d8 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Thu, 18 Jun 2020 19:11:40 -0700 Subject: [PATCH 080/112] [XLA] Introduce ManifestCheckingTest PiperOrigin-RevId: 317229603 Change-Id: Ibcc9ea3895d520024f5d80d52330aeb3b970585d --- tensorflow/compiler/xla/tests/BUILD | 23 +++- tensorflow/compiler/xla/tests/build_defs.bzl | 7 +- .../xla/tests/client_library_test_base.h | 3 +- tensorflow/compiler/xla/tests/hlo_test_base.h | 3 +- .../xla/tests/local_client_test_base.h | 3 +- .../xla/tests/manifest_checking_test.cc | 129 ++++++++++++++++++ .../xla/tests/manifest_checking_test.h | 35 +++++ tensorflow/compiler/xla/tests/test_macros.cc | 89 +----------- tensorflow/compiler/xla/tests/test_macros.h | 118 +--------------- 9 files changed, 201 insertions(+), 209 deletions(-) create mode 100644 tensorflow/compiler/xla/tests/manifest_checking_test.cc create mode 100644 tensorflow/compiler/xla/tests/manifest_checking_test.h diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index e1863a8a4cf..9b36117602b 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -52,16 +52,26 @@ cc_library( name = "test_macros_header", testonly = True, hdrs = ["test_macros.h"], - deps = [ - "//tensorflow/compiler/xla:types", - "//tensorflow/core:test", - "@com_google_absl//absl/strings", - ], ) # Generate a test_macros_${BACKEND} library per backend with the proper copts. 
generate_backend_test_macros() +cc_library( + name = "manifest_checking_test", + testonly = True, + srcs = ["manifest_checking_test.cc"], + hdrs = ["manifest_checking_test.h"], + deps = [ + ":test_macros_header", + "//tensorflow/core:regexp_internal", + "//tensorflow/core:test", + "//tensorflow/core/platform:logging", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "test_utils", srcs = ["test_utils.cc"], @@ -136,6 +146,7 @@ cc_library( hdrs = ["hlo_test_base.h"], deps = [ ":literal_test_util", + ":manifest_checking_test", ":test_utils", ":verified_hlo_module", "//tensorflow/compiler/xla:debug_options_flags", @@ -193,6 +204,7 @@ cc_library( srcs = ["client_library_test_base.cc"], hdrs = ["client_library_test_base.h"], deps = [ + ":manifest_checking_test", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", @@ -273,6 +285,7 @@ cc_library( hdrs = ["local_client_test_base.h"], deps = [ ":client_library_test_base", + ":manifest_checking_test", ":verified_hlo_module", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl index c0c0751b0de..94d870aa2ef 100644 --- a/tensorflow/compiler/xla/tests/build_defs.bzl +++ b/tensorflow/compiler/xla/tests/build_defs.bzl @@ -266,11 +266,6 @@ def generate_backend_test_macros(backends = []): "-DXLA_DISABLED_MANIFEST=\\\"%s\\\"" % manifest, ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "//tensorflow/compiler/xla:types", - "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", - "//tensorflow/core:test", + "//tensorflow/core/platform:logging", ], ) diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 790497f888e..17bb70bdb42 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/bitmap.h" @@ -62,7 +63,7 @@ std::vector ExpandUseBfloat16( } // A client library test establishes an in-process XLA client connection. -class ClientLibraryTestBase : public ::testing::Test { +class ClientLibraryTestBase : public ManifestCheckingTest { protected: explicit ClientLibraryTestBase(se::Platform* platform = nullptr); diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 85b1876dd3c..17c2a55ba5b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -32,6 +32,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -67,7 +68,7 @@ namespace xla { // ) // // For a more detailed example, see "../tests/sample_text_test.cc". -class HloTestBase : public ::testing::Test { +class HloTestBase : public ManifestCheckingTest { public: // Creates a new HLO module for a test. The module created will have // TestName() for its name; it will also automatically populate its debug diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index ea457024618..c1951ad1021 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" #include "tensorflow/compiler/xla/tests/verified_hlo_module.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/mutex.h" @@ -75,7 +76,7 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator { }; // A base class for tests which exercise the LocalClient interface. -class LocalClientTestBase : public ::testing::Test { +class LocalClientTestBase : public ManifestCheckingTest { protected: struct EigenThreadPoolWrapper; explicit LocalClientTestBase(se::Platform* platform = nullptr); diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.cc b/tensorflow/compiler/xla/tests/manifest_checking_test.cc new file mode 100644 index 00000000000..8806290472d --- /dev/null +++ b/tensorflow/compiler/xla/tests/manifest_checking_test.cc @@ -0,0 +1,129 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/tests/manifest_checking_test.h" + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" + +namespace xla { + +namespace { + +// Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is +// disabled - a sequence of regexps. 
+using ManifestT = absl::flat_hash_map>; + +ManifestT ReadManifest() { + ManifestT manifest; + + absl::string_view path = absl::NullSafeStringView(kDisabledManifestPath); + if (path.empty()) { + return manifest; + } + + // Note: parens are required to disambiguate vs function decl. + std::ifstream file_stream((std::string(path))); + std::string contents((std::istreambuf_iterator(file_stream)), + std::istreambuf_iterator()); + + std::vector lines = absl::StrSplit(contents, '\n'); + for (std::string& line : lines) { + auto comment = line.find("//"); + if (comment != std::string::npos) { + line = line.substr(0, comment); + } + if (line.empty()) { + continue; + } + absl::StripTrailingAsciiWhitespace(&line); + std::vector pieces = absl::StrSplit(line, ' '); + CHECK_GE(pieces.size(), 1); + auto& platforms = manifest[pieces[0]]; + for (size_t i = 1; i < pieces.size(); ++i) { + platforms.push_back(pieces[i]); + } + } + return manifest; +} + +} // namespace + +void ManifestCheckingTest::SetUp() { + const testing::TestInfo* test_info = + testing::UnitTest::GetInstance()->current_test_info(); + absl::string_view test_case_name = test_info->test_suite_name(); + absl::string_view test_name = test_info->name(); + VLOG(1) << "test_case_name: " << test_case_name; + VLOG(1) << "test_name: " << test_name; + + // Remove the type suffix from the test case name. + if (const char* type_param = test_info->type_param()) { + VLOG(1) << "type_param: " << type_param; + size_t last_slash = test_case_name.rfind('/'); + test_case_name = test_case_name.substr(0, last_slash); + VLOG(1) << "test_case_name: " << test_case_name; + } + + // Remove the test instantiation name if it is present. + auto first_slash = test_case_name.find('/'); + if (first_slash != test_case_name.npos) { + test_case_name.remove_prefix(first_slash + 1); + VLOG(1) << "test_case_name: " << test_case_name; + } + + ManifestT manifest = ReadManifest(); + + // If the test name ends with a slash followed by one or more characters, + // strip that off. + auto last_slash = test_name.rfind('/'); + if (last_slash != test_name.npos) { + test_name = test_name.substr(0, last_slash); + VLOG(1) << "test_name: " << test_name; + } + + // First try full match: test_case_name.test_name + // If that fails, try to find just the test_case_name; this would disable all + // tests in the test case. + auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); + if (it == manifest.end()) { + it = manifest.find(test_case_name); + if (it == manifest.end()) { + return; + } + } + + // Expect a full match vs. one of the platform regexps to disable the test. + const std::vector& disabled_platforms = it->second; + auto platform_string = kTestPlatform; + for (const auto& s : disabled_platforms) { + if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { + GTEST_SKIP(); + return; + } + } + + // We didn't hit in the disabled manifest entries, so don't disable it. +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/manifest_checking_test.h b/tensorflow/compiler/xla/tests/manifest_checking_test.h new file mode 100644 index 00000000000..4f44ed76a3e --- /dev/null +++ b/tensorflow/compiler/xla/tests/manifest_checking_test.h @@ -0,0 +1,35 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ + +#include "tensorflow/core/platform/test.h" + +namespace xla { + +// This class allows us to intercept the test name and use an arbitrary +// heuristic to decide whether the test case should be disabled. We +// determine whether the test case should be disabled by resolving the (test +// case name, test name) in a manifest file. +class ManifestCheckingTest : public ::testing::Test { + protected: + // This method runs before each test runs. + void SetUp() override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_TESTS_MANIFEST_CHECKING_TEST_H_ diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc index dc9ac7b684a..9e85af76e89 100644 --- a/tensorflow/compiler/xla/tests/test_macros.cc +++ b/tensorflow/compiler/xla/tests/test_macros.cc @@ -15,93 +15,18 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/regexp.h" namespace xla { -namespace { -// Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is -// disabled - a sequence of regexps. -using ManifestT = absl::flat_hash_map>; - -ManifestT ReadManifest() { - ManifestT manifest; - - string path = XLA_DISABLED_MANIFEST; - if (path.empty()) { - return manifest; - } - - std::ifstream file_stream(path); - // Note: parens are required to disambiguate vs function decl. - string contents((std::istreambuf_iterator(file_stream)), - std::istreambuf_iterator()); - - std::vector lines = absl::StrSplit(contents, '\n'); - for (string& line : lines) { - auto comment = line.find("//"); - if (comment != string::npos) { - line = line.substr(0, comment); - } - if (line.empty()) { - continue; - } - absl::StripTrailingAsciiWhitespace(&line); - std::vector pieces = absl::StrSplit(line, ' '); - CHECK_GE(pieces.size(), 1); - auto& platforms = manifest[pieces[0]]; - for (int64 i = 1; i < pieces.size(); ++i) { - platforms.push_back(pieces[i]); - } - } - return manifest; +static bool InitModule() { + kDisabledManifestPath = XLA_DISABLED_MANIFEST; + VLOG(1) << "kDisabledManifestPath: " << kDisabledManifestPath; + kTestPlatform = XLA_PLATFORM; + VLOG(1) << "kTestPlatform: " << kTestPlatform; + return false; } -} // namespace - -std::string PrependDisabledIfIndicated(absl::string_view test_case_name, - absl::string_view test_name) { - ManifestT manifest = ReadManifest(); - - // If the test name ends with a slash followed by one or more digits, strip - // that off; this is just a shard number, and matching on this would be - // unstable even if someone wanted to do it. 
- static LazyRE2 shard_num_pattern = {R"(/\d+$)"}; - absl::string_view suffix; - if (RE2::PartialMatch(test_name, *shard_num_pattern, &suffix)) { - test_name.remove_suffix(suffix.size()); - } - - // First try full match: test_case_name.test_name - // If that fails, try to find just the test_case_name; this would disable all - // tests in the test case. - auto it = manifest.find(absl::StrCat(test_case_name, ".", test_name)); - if (it == manifest.end()) { - it = manifest.find(test_case_name); - if (it == manifest.end()) { - return std::string(test_name); - } - } - - // Expect a full match vs. one of the platform regexps to disable the test. - const std::vector& disabled_platforms = it->second; - string platform_string = XLA_PLATFORM; - for (const auto& s : disabled_platforms) { - if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) { - return absl::StrCat("DISABLED_", test_name); - } - } - - // We didn't hit in the disabled manifest entries, so don't disable it. - return std::string(test_name); -} +static bool module_initialized = InitModule(); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index 33d2dff9721..f62bccbe850 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -28,12 +28,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ #define TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ -#include - -#include "absl/strings/string_view.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/platform/test.h" - #define DISABLED_ON_CPU(X) X #define DISABLED_ON_GPU(X) X #define DISABLED_ON_GPU_ROCM(X) X @@ -79,117 +73,15 @@ limitations under the License. namespace xla { -// Reads a disabled manifest file to resolve whether test cases should be -// disabled on a particular platform. For a test that should be disabled, -// returns DISABLED_ prepended to its name; otherwise returns the test name -// unmodified. -std::string PrependDisabledIfIndicated(absl::string_view test_case_name, - absl::string_view test_name); +inline const char *kDisabledManifestPath = nullptr; +inline const char *kTestPlatform = nullptr; } // namespace xla -// This is the internal "gtest" class instantiation -- it is identical to the -// GTEST_TEST_ macro, except that we intercept the test name for potential -// modification by PrependDisabledIfIndicated. That file can use an arbitrary -// heuristic to decide whether the test case should be disabled, and we -// determine whether the test case should be disabled by resolving the (test -// case name, test name) in a manifest file. 
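As an aside for readers following this refactoring: the manifest file referred to above is plain text, one entry per line, consisting of a test name (TestCase.TestName, or just TestCase to cover a whole suite) followed by the platform regexps on which it is disabled, with // starting a comment. A small Python sketch of the lookup that ManifestCheckingTest::SetUp() now performs at runtime (illustration only; the test and platform names below are invented):

import re

EXAMPLE_MANIFEST = """
// Example manifest entries (hypothetical test and platform names).
DummyTest.FailsOnGpu GPU CUDA.*
DummyFlakySuite CPU GPU
"""

def read_manifest(text):
    manifest = {}
    for line in text.splitlines():
        line = line.split("//")[0].strip()  # drop comments and blank lines
        if not line:
            continue
        name, *platforms = [p for p in line.split(" ") if p]
        manifest.setdefault(name, []).extend(platforms)
    return manifest

def should_skip(manifest, test_case, test_name, platform):
    # Try the full "TestCase.TestName" key first, then the suite name alone.
    platforms = manifest.get("%s.%s" % (test_case, test_name),
                             manifest.get(test_case, []))
    return any(re.fullmatch(p, platform) for p in platforms)

manifest = read_manifest(EXAMPLE_MANIFEST)
print(should_skip(manifest, "DummyTest", "FailsOnGpu", "CUDA_GPU"))  # True
print(should_skip(manifest, "DummyTest", "FailsOnGpu", "CPU"))       # False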
-#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public parent_class { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - \ - private: \ - virtual void TestBody(); \ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)); \ - }; \ - \ - ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::test_info_ = \ - ::testing::RegisterTest( \ - #test_case_name, \ - ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name) \ - .c_str(), \ - nullptr, nullptr, __FILE__, __LINE__, []() -> parent_class* { \ - return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name)(); \ - }); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() +#define XLA_TEST_F(test_fixture, test_name) TEST_F(test_fixture, test_name) -// This is identical to the TEST_F macro from "gtest", but it potentially -// disables the test based on an external manifest file, DISABLED_MANIFEST. -// -// Per usual, you can see what tests are available via --gunit_list_tests and -// choose to run tests that have been disabled via the manifest via -// --gunit_also_run_disabled_tests. -#define XLA_TEST_F(test_fixture, test_name) \ - XLA_GTEST_TEST_(test_fixture, test_name, test_fixture) +#define XLA_TEST_P(test_case_name, test_name) TEST_P(test_case_name, test_name) -// Likewise, this is identical to the TEST_P macro from "gtest", but -// potentially disables the test based on the DISABLED_MANIFEST file. -// -// We have to wrap this in an outer layer so that any DISABLED_ON_* macros will -// be properly expanded before the stringification occurs. -#define XLA_TEST_P_IMPL_(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance() \ - ->parameterized_test_registry() \ - .GetTestCasePatternHolder( \ - #test_case_name, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ - ->AddTestPattern( \ - #test_case_name, \ - ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name) \ - .c_str(), \ - new ::testing::internal::TestMetaFactory()); \ - return 0; \ - } \ - static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#define XLA_TEST_P(test_case_name, test_name) \ - XLA_TEST_P_IMPL_(test_case_name, test_name) - -// This is identical to the TEST_F macro from "gtest", but it potentially -// disables the test based on an external manifest file, DISABLED_MANIFEST. 
-#define XLA_TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel, \ - GTEST_TYPE_PARAMS_(CaseName)>:: \ - Register( \ - "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - #CaseName, \ - ::xla::PrependDisabledIfIndicated(#CaseName, #TestName).c_str(), \ - 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, \ - TestName)::TestBody() +#define XLA_TYPED_TEST(CaseName, TestName) TYPED_TEST(CaseName, TestName) #endif // TENSORFLOW_COMPILER_XLA_TESTS_TEST_MACROS_H_ From 9c4b749b09b958c436e0681a4276b47fc9316a8a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 19:17:40 -0700 Subject: [PATCH 081/112] Internal change PiperOrigin-RevId: 317230321 Change-Id: I043dae37768f6e9cf946d4db2a8c36123ed2d6d9 --- tensorflow/core/platform/BUILD | 7 - tensorflow/core/platform/tf32_utils.cc | 30 -- tensorflow/core/platform/tf32_utils.h | 27 -- tensorflow/python/BUILD | 11 - tensorflow/python/framework/config.py | 31 --- tensorflow/python/util/tf32.cc | 22 -- tensorflow/stream_executor/cuda/BUILD | 2 - tensorflow/stream_executor/cuda/cuda_blas.cc | 98 ++++--- tensorflow/stream_executor/cuda/cuda_blas.h | 8 +- tensorflow/stream_executor/cuda/cuda_dnn.cc | 272 ++++++++----------- 10 files changed, 172 insertions(+), 336 deletions(-) delete mode 100644 tensorflow/core/platform/tf32_utils.cc delete mode 100644 tensorflow/core/platform/tf32_utils.h delete mode 100644 tensorflow/python/util/tf32.cc diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 33a1e7cfe0a..70bb8a89417 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -938,13 +938,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "tf32_utils", - srcs = ["tf32_utils.cc"], - hdrs = ["tf32_utils.h"], - copts = tf_copts(), -) - tf_cc_tests( name = "low_level_library_tests", size = "small", diff --git a/tensorflow/core/platform/tf32_utils.cc b/tensorflow/core/platform/tf32_utils.cc deleted file mode 100644 index d2f40ea161a..00000000000 --- a/tensorflow/core/platform/tf32_utils.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/platform/tf32_utils.h" - -#include - -namespace tensorflow { - -// Whether TensorFloat-32 should be used where supported. -// TODO(nluehr): Maybe enable by default after TF32 Ampere testing. 
-static std::atomic tf32_allowed{false}; - -void allow_tf32_execution(bool allowed) { tf32_allowed = allowed; } - -bool tf32_execution_allowed() { return tf32_allowed; } - -} // namespace tensorflow diff --git a/tensorflow/core/platform/tf32_utils.h b/tensorflow/core/platform/tf32_utils.h deleted file mode 100644 index 7a158d00ad3..00000000000 --- a/tensorflow/core/platform/tf32_utils.h +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ -#define TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ - -namespace tensorflow { - -void allow_tf32_execution(bool allowed); - -bool tf32_execution_allowed(); - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_PLATFORM_TF32_UTILS_H_ diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 5f9e2dfb1ff..de9cf9a24c7 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -788,16 +788,6 @@ tf_python_pybind_extension( ], ) -tf_python_pybind_extension( - name = "_pywrap_tf32_execution", - srcs = ["util/tf32.cc"], - module_name = "_pywrap_tf32_execution", - deps = [ - "//tensorflow/core/platform:tf32_utils", - "@pybind11", - ], -) - tf_python_pybind_extension( name = "_pywrap_util_port", srcs = ["util/port_wrapper.cc"], @@ -5688,7 +5678,6 @@ py_library( "//tensorflow:composite_tensor_whitelist", ], deps = [ - ":_pywrap_tf32_execution", ":tf_decorator", ":tf_export", ":tf_stack", diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 544b6882618..9ff16f2a327 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -18,42 +18,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python import _pywrap_tf32_execution from tensorflow.python.eager import context from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export -# No tf_export until TF is built against CUDA11 which is required for TF32. -def tensor_float_32_execution_allowed(): - """Get if TensorFloat-32 operations are enabled on supported hardware. - - Returns: - True if TensorFloat-32 execution is enabled and False otherwise. - """ - return _pywrap_tf32_execution.is_allowed() - - -# No tf_export until TF is built against CUDA 11 which is required for TF32. -def allow_tensor_float_32_execution(allowed): - """Allow use of TensorFloat-32 with float32 ops on supported hardware. - - TensorFloat-32 is a math mode introduced with the NVIDIA Ampere architecture. - TensorFloat-32 kernels take float32 inputs and produce float32 outputs. - Internally, the inputs are cast to a custom representation with 10-bit - mantissa (similar to float16) and 8-bit exponent (similar to float32) and are - executed using TensorCores with float32 accumulation. 
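To make the precision trade-off described in the docstring being deleted above concrete, here is a rough NumPy sketch of what keeping only a 10-bit mantissa does to a float32 value. This is an illustration of the format only; real TensorFloat-32 hardware rounds inside the TensorCore rather than truncating bits ahead of time.

import numpy as np

def emulate_tf32(x):
    # Keep the sign bit, the 8-bit exponent and the top 10 mantissa bits of a
    # float32 by zeroing the low 13 mantissa bits (truncation, for simplicity).
    bits = np.array([x], dtype=np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFFE000)).view(np.float32)[0]

x = np.float32(1.0001)
print(x, emulate_tf32(x))  # prints roughly: 1.0001 1.0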
For more information, - see https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/. - - TensorFloat-32 execution is disabled by default, but this may change in a - future version. - - Args: - allowed: whether to allow TensorFloat-32 execution - """ - _pywrap_tf32_execution.allow(allowed) - - @tf_export('config.threading.get_intra_op_parallelism_threads') def get_intra_op_parallelism_threads(): """Get number of threads used within an individual op for parallelism. diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/python/util/tf32.cc deleted file mode 100644 index 7dece6ccdae..00000000000 --- a/tensorflow/python/util/tf32.cc +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "pybind11/pybind11.h" -#include "tensorflow/core/platform/tf32_utils.h" - -PYBIND11_MODULE(_pywrap_tf32_execution, m) { - m.def("allow", &tensorflow::allow_tf32_execution); - m.def("is_allowed", &tensorflow::tf32_execution_allowed); -} diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 3a14be9ad50..c3cf9f5db15 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -251,7 +251,6 @@ cc_library( "@local_config_cuda//cuda:cuda_headers", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:host_or_device_scalar", @@ -357,7 +356,6 @@ cc_library( "@local_config_cuda//cuda:cudnn_header", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core/platform:tf32_utils", "//tensorflow/stream_executor:dnn", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:plugin_registry", diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index fcd0e7b16fb..c9f0fc462c9 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -49,7 +49,6 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "third_party/eigen3/Eigen/Core" -#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" @@ -102,6 +101,18 @@ static std::string ToString(cublasStatus_t status) { } } +// Decide whether to enable TENSOR_OP_MATH +static bool TensorOpMathEnabled() { + static bool is_enabled = [] { + bool is_disabled; + TF_CHECK_OK( + tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUBLAS_TENSOR_OP_MATH", + /*default_val=*/false, &is_disabled)); + return !is_disabled; + }(); + return is_enabled; +} + // cuBLAS has interfaces that permit pointers to be passed from either the host // memory space or the device memory space; however, you must instruct it as to // which address space those pointers are in with cublasSetPointerMode. @@ -226,19 +237,6 @@ bool CUDABlas::Init() { return false; } - absl::MutexLock lock(&mu_); -#if CUDA_VERSION >= 9000 -#if CUBLAS_VER_MAJOR >= 11 - ret = cublasSetMathMode(blas_, CUBLAS_TF32_TENSOR_OP_MATH); -#else - ret = cublasSetMathMode(blas_, CUBLAS_TENSOR_OP_MATH); -#endif - if (ret != CUBLAS_STATUS_SUCCESS) { - LOG(ERROR) << "failed to set cublas default math mode: " << ToString(ret); - return false; - } -#endif - return true; } @@ -401,7 +399,7 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) { template bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - Args... args) { + bool use_tensor_op_math, Args... args) { absl::MutexLock lock(&mu_); CHECK(blas_ != nullptr); @@ -415,10 +413,10 @@ bool CUDABlas::DoBlasInternalImpl(FuncT cublas_func, Stream *stream, : CUBLAS_POINTER_MODE_DEVICE)) { return false; } -#if CUBLAS_VER_MAJOR >= 11 +#if CUDA_VERSION >= 9000 ScopedCublasMathMode math_mode{blas_}; - if (!tensorflow::tf32_execution_allowed()) { - if (!math_mode.Init(CUBLAS_DEFAULT_MATH)) { + if (use_tensor_op_math) { + if (!math_mode.Init(CUBLAS_TENSOR_OP_MATH)) { return false; } } @@ -1635,9 +1633,21 @@ bool CUDABlas::DoBlasGemm( } } + bool use_tensor_ops = false; +#if CUDA_VERSION >= 9000 + int cc_major, cc_minor; + stream->parent()->GetDeviceDescription().cuda_compute_capability(&cc_major, + &cc_minor); + + // GPUs < sm_70 don't support tensor ops. + if (cc_major >= 7 && TensorOpMathEnabled()) { + use_tensor_ops = true; + } +#endif + return DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, CUDABlasTranspose(transa), + true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a), SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta, GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc); @@ -1911,7 +1921,8 @@ static bool TensorOpsAvailable(int cc_major) { // strictly correct. We can't simply enable it, though, as that would change // clients' behavior significantly: Using tensor ops on fp32 inputs cause them // to be rounded to fp16. 
- if (cc_major >= 7 && std::is_same::value) { + if (cc_major >= 7 && TensorOpMathEnabled() && + std::is_same::value) { return true; } #endif @@ -2259,8 +2270,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( if (stream->parent()->GetDeviceDescription().cuda_compute_capability( &cc_major, &cc_minor) && cc_major >= 5) { - bool use_tensor_ops = - data_type == CUDA_R_16F || tensorflow::tf32_execution_allowed(); + bool use_tensor_ops = TensorOpMathEnabled() && data_type == CUDA_R_16F; cublasGemmAlgo_t algo = (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); cudaDataType_t compute_type = @@ -2274,7 +2284,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal( bool ok; ok = DoBlasInternalImpl( AS_LAMBDA(cublasGemmBatchedEx), stream, true /* = pointer_mode_host */, - true /* = err_on_failure */, CUDABlasTranspose(transa), + true /* = err_on_failure */, use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_void_ptrs, data_type, lda, b_void_ptrs, data_type, ldb, &beta, c_void_ptrs, data_type, ldc, batch_count, compute_type, algo); @@ -2409,25 +2419,33 @@ bool CUDABlas::DoBlasGemmStridedBatched( int lda, int64 stride_a, const DeviceMemory &b, int ldb, int64 stride_b, float beta, DeviceMemory *c, int ldc, int64 stride_c, int batch_count) { -#if CUDA_VERSION >= 9010 + bool use_tensor_ops = false; +#if CUDA_VERSION >= 9000 int cc_major, cc_minor; if (stream->parent()->GetDeviceDescription().cuda_compute_capability( - &cc_major, &cc_minor) && - cc_major >= 5) { - cublasGemmAlgo_t algo = - (cc_major >= 7 ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); - bool ok = DoBlasInternalImpl( - AS_LAMBDA(cublasGemmStridedBatchedEx), stream, - true /* = pointer_mode_host */, true /* = err_on_failure */, - CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, - GpuMemory(a), CUDA_R_16F, lda, stride_a, GpuMemory(b), CUDA_R_16F, ldb, - stride_b, &beta, GpuMemoryMutable(c), CUDA_R_16F, ldc, stride_c, - batch_count, CUDA_R_32F, algo); - if (ok) { - return true; + &cc_major, &cc_minor)) { + // GPUs < sm_70 don't support tensor ops. + if (cc_major >= 7 && TensorOpMathEnabled()) { + use_tensor_ops = true; } - LOG(ERROR) << "failed BLAS call, see log for details"; - return false; +#if CUDA_VERSION >= 9010 + if (cc_major >= 5) { + cublasGemmAlgo_t algo = + (use_tensor_ops ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT); + bool ok = DoBlasInternalImpl( + AS_LAMBDA(cublasGemmStridedBatchedEx), stream, + true /* = pointer_mode_host */, true /* = err_on_failure */, + use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), + m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a, + GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c), + CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo); + if (ok) { + return true; + } + LOG(ERROR) << "failed BLAS call, see log for details"; + return false; + } +#endif } #endif // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop. 
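For context on the loop mentioned in the comment above: a strided batched GEMM performs batch_count independent matrix multiplies whose operands sit at fixed offsets inside three flat buffers, which is what the fallback reproduces one cublasSgemmEx call at a time. A rough row-major NumPy sketch of those semantics, my own illustration that ignores the transpose arguments and cuBLAS's column-major layout:

import numpy as np

def gemm_strided_batched(alpha, a_buf, stride_a, b_buf, stride_b,
                         beta, c_buf, stride_c, m, n, k, batch_count):
    for i in range(batch_count):
        a_i = a_buf[i * stride_a:i * stride_a + m * k].reshape(m, k)
        b_i = b_buf[i * stride_b:i * stride_b + k * n].reshape(k, n)
        c_i = c_buf[i * stride_c:i * stride_c + m * n].reshape(m, n)
        c_i[:] = alpha * (a_i @ b_i) + beta * c_i  # one GEMM per batch entry

m = n = k = 2
batch_count = 3
a = np.arange(batch_count * m * k, dtype=np.float32)
b = np.ones(batch_count * k * n, dtype=np.float32)
c = np.zeros(batch_count * m * n, dtype=np.float32)
gemm_strided_batched(1.0, a, m * k, b, k * n, 0.0, c, m * n, m, n, k, batch_count)
print(c.reshape(batch_count, m, n))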
@@ -2440,7 +2458,7 @@ bool CUDABlas::DoBlasGemmStridedBatched( reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c); bool ok = DoBlasInternalImpl( cublasSgemmEx, stream, true /* = pointer_mode_host */, - true /* = err_on_failure= */, CUDABlasTranspose(transa), + true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha, a_matrix, SE_CUDA_DATA_HALF, lda, b_matrix, SE_CUDA_DATA_HALF, ldb, &beta, c_matrix, SE_CUDA_DATA_HALF, ldc); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 556456c83db..817bdb72777 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -83,7 +83,7 @@ class CUDABlas : public blas::BlasSupport { template bool DoBlasInternalImpl(FuncT cublas_func, Stream *stream, bool pointer_mode_host, bool err_on_failure, - Args... args); + bool use_tensor_op_math, Args... args); // Convenience functions that call DoBlasInternalImpl with different values // for err_on_failure. @@ -91,7 +91,8 @@ class CUDABlas : public blas::BlasSupport { bool DoBlasInternal(FuncT cublas_func, Stream *stream, bool pointer_mode_host, Args... args) { return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/true, args...); + /*err_on_failure=*/true, /*use_tensor_ops=*/false, + args...); } template bool DoBlasInternalFailureOK(FuncT cublas_func, Stream *stream, @@ -99,7 +100,8 @@ class CUDABlas : public blas::BlasSupport { // Tensor ops are hard-coded off in this path, but can still be enabled with // a specific algorithm choice as in DoBlasGemmWithAlgorithmImpl(). return DoBlasInternalImpl(cublas_func, stream, pointer_mode_host, - /*err_on_failure=*/false, args...); + /*err_on_failure=*/false, + /*use_tensor_ops=*/false, args...); } // A helper function to implement DoBlasGemmBatched interfaces for generic diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 192bae91572..be18c989861 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -22,7 +22,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/tf32_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" @@ -602,6 +601,31 @@ class CudnnFilterDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor); }; +// A helper function to decide whether to enable the TENSOR_OP_MATH math type +bool TensorOpMathEnabled() { + static bool is_enabled = [] { + bool is_disabled = false; + TF_CHECK_OK( + tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_TENSOR_OP_MATH", + /*default_val=*/false, &is_disabled)); + return !is_disabled; + }(); + return is_enabled; +} + +// A helper function to decide whether to enable the TENSOR_OP_MATH math type +// for RNNs. +bool RnnTensorOpMathEnabled() { + static bool is_enabled = [] { + bool is_disabled = false; + TF_CHECK_OK( + tensorflow::ReadBoolFromEnvVar("TF_DISABLE_CUDNN_RNN_TENSOR_OP_MATH", + /*default_val=*/false, &is_disabled)); + return !is_disabled; + }(); + return is_enabled; +} + // A helper function to decide whether to use // CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. 
This mode can be faster in // some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT @@ -706,6 +730,10 @@ class CudnnConvolutionDescriptor { : CUDNN_CROSS_CORRELATION, data_type)); + // NOTE(benbarsdell): This only applies if tensor op math is enabled + // and algo selection is set to Default. + this->set_use_tensor_op_math(true); + #if CUDNN_MAJOR >= 7 VLOG(2) << "Requesting grouped convolution: " << convolution_descriptor.group_count(); @@ -717,15 +745,13 @@ class CudnnConvolutionDescriptor { #endif } - void set_use_tensor_op_math(bool use_tensor_op_math) { + void set_use_tensor_op_math(bool use_tensor_op_math) const { #if CUDNN_VERSION >= 7000 cudnnMathType_t math_type = -#if CUDNN_VERSION >= 8000 - (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_FMA_MATH); -#else (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH); -#endif - CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); + if (TensorOpMathEnabled()) { + CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type)); + } #endif } @@ -737,40 +763,6 @@ class CudnnConvolutionDescriptor { SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor); }; -// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math -// set -static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { - cudnnMathType_t math_type; - CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); -#if CUDNN_VERSION >= 8000 - return math_type != CUDNN_FMA_MATH; -#else - return math_type == CUDNN_TENSOR_OP_MATH; -#endif -} - -static bool TensorOpMathAvailable(int cc_major) { - return cc_major >= 7 && CUDNN_VERSION >= 7000; -} - -static bool IsTensorMathAllowed(Stream* stream, dnn::DataType input_type) { - int cc_major, cc_minor; - std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); - if (!TensorOpMathAvailable(cc_major)) { - return false; - } - if (input_type == dnn::DataType::kFloat) { -#if CUDNN_VERSION < 8000 - return false; -#else - if (!tensorflow::tf32_execution_allowed()) { - return false; - } -#endif - } - return true; -} - // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle // within a scope. class CudnnPoolingDescriptor { @@ -1163,31 +1155,21 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // in profile mode, which is run with algorithms returned from // GetRnnAlgorithms() (which are non-default and explicitly set whether to // use tensor ops). CuDNN 7.2.1 fixed this issue - bool allow_tensor_ops = - data_type != CUDNN_DATA_FLOAT || tensorflow::tf32_execution_allowed(); - bool use_tensor_ops; - if (algorithm_config.algorithm().has_value()) { - use_tensor_ops = algorithm_config.algorithm()->tensor_ops_enabled(); - } else { - use_tensor_ops = CUDNN_VERSION >= 7201 && allow_tensor_ops; - } - - if (use_tensor_ops && !allow_tensor_ops) { - return port::Status(port::error::INVALID_ARGUMENT, - "Algo requests disallowed tensor op evaluation."); - } - - cudnnMathType_t math_type; - if (use_tensor_ops) { - math_type = CUDNN_TENSOR_OP_MATH; - } else { -#if CUDNN_VERSION >= 8000 - math_type = CUDNN_FMA_MATH; + if (RnnTensorOpMathEnabled()) { + cudnnMathType_t math_type; + if (algorithm_config.algorithm().has_value()) { + math_type = algorithm_config.algorithm()->tensor_ops_enabled() + ? 
CUDNN_TENSOR_OP_MATH + : CUDNN_DEFAULT_MATH; + } else { +#if CUDNN_VERSION >= 7201 + math_type = CUDNN_TENSOR_OP_MATH; #else - math_type = CUDNN_DEFAULT_MATH; -#endif // CUDNN_VERSION >= 8000 + math_type = CUDNN_DEFAULT_MATH; +#endif // CUDNN_VERSION >= 7201 + } + CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); } - CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type)); #endif // CUDNN_VERSION >= 7000 return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan), @@ -2578,11 +2560,10 @@ port::StatusOr> AllocateCudnnConvolutionForwardWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { - return port::Status( - port::error::INTERNAL, - "Mismatch between cudnn conv and algorithm descriptors."); - } + // TODO(csigg): This has side effects on the convolution descriptor. It is + // functionally correct because the convolution is run with the algorithm of + // the last call to this function, but should be fixed anyway. + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2622,11 +2603,10 @@ AllocateCudnnConvolutionBackwardDataWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { - return port::Status( - port::error::INTERNAL, - "Mismatch between cudnn conv and algorithm descriptors."); - } + // TODO(csigg): This has side effects on the convolution descriptor. It is + // functionally correct because the convolution is run with the algorithm of + // the last call to this function, but should be fixed anyway. + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2668,11 +2648,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( const CudnnTensorDescriptor& output_nd, const dnn::AlgorithmDesc& algorithm_desc, ScratchAllocator* scratch_allocator) { - if (IsTensorMathOpSet(conv) != algorithm_desc.tensor_ops_enabled()) { - return port::Status( - port::error::INTERNAL, - "Mismatch between cudnn conv and algorithm descriptors."); - } + // TODO(csigg): This has side effects on the convolution descriptor. It is + // functionally correct because the convolution is run with the algorithm of + // the last call to this function, but should be fixed anyway. + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); // Query the size of the workspace and allocate it. 
size_t size_in_bytes; @@ -2706,42 +2685,18 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( return scratch_allocator->AllocateBytes(size_in_bytes); } -port::StatusOr UseTensorOps(Stream* stream, dnn::DataType type, - absl::optional desc) { - bool use_tensor_ops; - if (desc.has_value()) { - use_tensor_ops = desc->tensor_ops_enabled(); - if (use_tensor_ops && !IsTensorMathAllowed(stream, type)) { - return port::Status(port::error::INVALID_ARGUMENT, - "Algo requests disallowed tensor op evaluation."); - } - } else { - use_tensor_ops = IsTensorMathAllowed(stream, type); - } - return use_tensor_ops; +static bool TensorOpMathAvailable(int cc_major) { + return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled(); } -cudnnDataType_t GetRnnComputeType(dnn::DataType data_type); -dnn::DataType GetConvAccumulatorType(dnn::DataType data_type); - port::StatusOr GetCudnnConvolutionForwardAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - dnn::DataType element_type, - const dnn::ConvolutionDescriptor& convolution_descriptor, + const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); - - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); - bool use_tensor_ops; - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); - if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. @@ -2754,7 +2709,10 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( GetCudnnConvolutionForwardAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + algo_desc = dnn::AlgorithmDesc( + algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); } const auto scratch_or = AllocateCudnnConvolutionForwardWorkspace( @@ -2778,9 +2736,6 @@ port::StatusOr GetCudnnConvolutionForwardAlgorithm( "Returned status: ", scratch_or.status().ToString())); } - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2791,19 +2746,10 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - dnn::DataType element_type, - const dnn::ConvolutionDescriptor& convolution_descriptor, + const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); - bool use_tensor_ops; - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); - if (!algo_desc.has_value()) { // Pick fastest algorithm 
within memory limit according to cuDNN's // heuristics. @@ -2816,7 +2762,10 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( GetCudnnConvolutionBackwardDataAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + algo_desc = dnn::AlgorithmDesc( + algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); } const auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace( @@ -2839,9 +2788,6 @@ port::StatusOr GetCudnnConvolutionBackwardDataAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -2852,19 +2798,10 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( Stream* stream, const CudnnHandle& cudnn, const dnn::AlgorithmConfig& algorithm_config, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, - dnn::DataType element_type, - const dnn::ConvolutionDescriptor& convolution_descriptor, + const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator, DeviceMemory* scratch) { absl::optional algo_desc = algorithm_config.algorithm(); - CudnnConvolutionDescriptor conv( - convolution_descriptor, - ToCudnnDataType(GetConvAccumulatorType(element_type))); - bool use_tensor_ops; - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); - if (!algo_desc.has_value()) { // Pick fastest algorithm within memory limit according to cuDNN's // heuristics. 
@@ -2877,7 +2814,10 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( GetCudnnConvolutionBackwardFilterAlgo( cudnn, input_nd, filter, conv, output_nd, specify_workspace_limit, memory_limit_bytes)); - algo_desc = dnn::AlgorithmDesc(algo, use_tensor_ops); + int cc_major, cc_minor; + std::tie(cc_major, cc_minor) = GetCcMajorMinor(stream); + algo_desc = dnn::AlgorithmDesc( + algo, /*use_tensor_ops=*/TensorOpMathAvailable(cc_major)); } auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace( @@ -2900,9 +2840,6 @@ port::StatusOr GetCudnnConvolutionBackwardFilterAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN(use_tensor_ops, - UseTensorOps(stream, element_type, algo_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace( stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc, scratch_allocator)); @@ -3067,32 +3004,35 @@ port::Status CudnnSupport::DoPrepareForConvolution( CudnnTensorDescriptor output_nd( output_descriptor, ToCudnnDataType(element_type, output_descriptor.layout())); + CudnnConvolutionDescriptor conv( + convolution_descriptor, + ToCudnnDataType(GetConvAccumulatorType(element_type))); auto cudnn = cudnn_->GetHandle(parent_, stream); switch (kind) { case dnn::ConvolutionKind::FORWARD: { - SE_ASSIGN_OR_RETURN(*algorithm_desc, - GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, input_nd, - filter_nd, element_type, convolution_descriptor, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionForwardAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter_nd, conv, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_DATA: { - SE_ASSIGN_OR_RETURN(*algorithm_desc, - GetCudnnConvolutionBackwardDataAlgorithm( - stream, cudnn, algorithm_config, input_nd, - filter_nd, element_type, convolution_descriptor, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionBackwardDataAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter_nd, conv, + output_nd, scratch_allocator, scratch_memory)); break; } case dnn::ConvolutionKind::BACKWARD_FILTER: { - SE_ASSIGN_OR_RETURN(*algorithm_desc, - GetCudnnConvolutionBackwardFilterAlgorithm( - stream, cudnn, algorithm_config, input_nd, - filter_nd, element_type, convolution_descriptor, - output_nd, scratch_allocator, scratch_memory)); + SE_ASSIGN_OR_RETURN( + *algorithm_desc, + GetCudnnConvolutionBackwardFilterAlgorithm( + stream, cudnn, algorithm_config, input_nd, filter_nd, conv, + output_nd, scratch_allocator, scratch_memory)); break; } default: @@ -3121,9 +3061,8 @@ port::Status CudnnSupport::DoConvolve( auto accumulator_type = GetConvAccumulatorType(element_type); CudnnConvolutionDescriptor conv(convolution_descriptor, ToCudnnDataType(accumulator_type)); - SE_ASSIGN_OR_RETURN(bool use_tensor_ops, - UseTensorOps(stream, element_type, algorithm_desc)); - conv.set_use_tensor_op_math(use_tensor_ops); + // Set use_tensor_math param to correct value + conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); auto cudnn = cudnn_->GetHandle(parent_, stream); // Alpha is the scaling factor for input. 
@@ -3356,6 +3295,14 @@ port::Status CudnnSupport::DoConvolve( return port::Status::OK(); } +// A helper function to query if a CudnnConvolutionDescriptor has tensor_op_math +// set +static bool IsTensorMathOpSet(const CudnnConvolutionDescriptor& conv) { + cudnnMathType_t math_type; + CHECK_CUDNN_OK(cudnnGetConvolutionMathType(conv.handle(), &math_type)); + return math_type == CUDNN_TENSOR_OP_MATH; +} + template port::Status CudnnSupport::DoFusedConvolveImpl( @@ -3389,6 +3336,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl( filter_descriptor, GetCudnnDataType(conv_input_descriptor.layout())); CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType()); + CudnnConvolutionDescriptor conv(convolution_descriptor, + ToCudnnDataType(accumulator_type)); auto cudnn = cudnn_->GetHandle(parent_, stream); @@ -3398,14 +3347,9 @@ port::Status CudnnSupport::DoFusedConvolveImpl( SE_ASSIGN_OR_RETURN( dnn::AlgorithmDesc algo_desc, GetCudnnConvolutionForwardAlgorithm( - stream, cudnn, algorithm_config, conv_input_nd, filter, - dnn::ToDataType::value, convolution_descriptor, + stream, cudnn, algorithm_config, conv_input_nd, filter, conv, output_nd, scratch_allocator, &scratch)); - CudnnConvolutionDescriptor conv(convolution_descriptor, - ToCudnnDataType(accumulator_type)); - conv.set_use_tensor_op_math(algo_desc.tensor_ops_enabled()); - std::unique_ptr timer; if (is_profiling) { timer.reset(new GpuTimer(parent_)); // NOLINT @@ -3536,7 +3480,9 @@ bool CudnnSupport::GetRnnAlgorithms( for (auto i : algo_types) { out_algorithms->push_back({i, /*use_tensor_ops=*/false}); #if CUDNN_VERSION >= 7100 - out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + if (RnnTensorOpMathEnabled()) { + out_algorithms->push_back({i, /*use_tensor_ops=*/true}); + } #endif } return true; From 64f7bdd56a394ecae55c5006e483050569b9b136 Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Thu, 18 Jun 2020 19:32:16 -0700 Subject: [PATCH 082/112] Disable tsan on InterpreterFlexTest and SelectiveBuiltInterpreterFlexTest PiperOrigin-RevId: 317231748 Change-Id: I7ab662fd55024c0ed91bd78bfdc8e9206d78b3b6 --- tensorflow/lite/delegates/flex/BUILD | 1 + tensorflow/lite/java/BUILD | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 42914bf5ab8..99bcf05ab4a 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -279,6 +279,7 @@ java_test( "no_oss", # Currently requires --config=monolithic, b/118895218. # TODO(b/121204962): Re-enable test after fixing memory leaks. "noasan", + "notsan", # TODO(b/158651814) Re-enable after fixing racing condition. ], test_class = "org.tensorflow.lite.InterpreterFlexTest", visibility = ["//visibility:private"], diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 738d66a0eb1..89be932ab4d 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -304,6 +304,7 @@ java_test( "no_oss", # Currently requires --config=monolithic, b/118895218. # TODO(b/121204962): Re-enable test after fixing memory leaks. "noasan", + "notsan", # TODO(b/158651814) Re-enable after fixing racing condition. ], test_class = "org.tensorflow.lite.InterpreterFlexTest", visibility = ["//visibility:private"], From c159f1599548428660c80dada924d69f269384a3 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 18 Jun 2020 19:35:14 -0700 Subject: [PATCH 083/112] Fork the keras related tpu_strategy_test to keras integration test. 
PiperOrigin-RevId: 317232048 Change-Id: If05867985ff1ff81ac45bb601b701ee68d4d5279 --- tensorflow/python/distribute/BUILD | 1 - .../python/distribute/tpu_strategy_test.py | 19 ----- .../python/keras/integration_test/BUILD | 13 ++++ .../integration_test/tpu_strategy_test.py | 69 +++++++++++++++++++ 4 files changed, 82 insertions(+), 20 deletions(-) create mode 100644 tensorflow/python/keras/integration_test/tpu_strategy_test.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 7208807a18c..4d77c12f975 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -654,7 +654,6 @@ tpu_py_test( "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", - "//tensorflow/python/keras", ], ) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 6dd7de500e4..400b12112d6 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context @@ -364,24 +363,6 @@ class TPUStrategyTest(test.TestCase): expected_result, strategy.experimental_local_results(train_step(next(input_iterator)))) - def test_keras_metric_outside_strategy_scope_per_replica(self): - strategy = get_tpu_strategy() - metric = keras.metrics.Mean("test_metric", dtype=dtypes.float32) - - dataset = dataset_ops.Dataset.range(strategy.num_replicas_in_sync * - 2).batch(2) - dataset = strategy.experimental_distribute_dataset(dataset) - - @def_function.function - def step_fn(i): - metric.update_state(i) - - with self.assertRaisesRegex(ValueError, "Trying to run metric.update_state " - "in replica context"): - with strategy.scope(): - for i in dataset: - strategy.run(step_fn, args=(i,)) - # TODO(b/145574622): Remove this test once it is re-enabled in values_test.py. def test_all_reduce_on_sync_on_read_variable(self): strategy = get_tpu_strategy() diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index 2ef775a190e..b23dcc59b97 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -2,6 +2,7 @@ # Contains Keras integration tests that verify with other TF high level APIs. load("//tensorflow:tensorflow.bzl", "cuda_py_test", "tf_py_test") +load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") package( default_visibility = [ @@ -91,3 +92,15 @@ cuda_py_test( "//tensorflow/python:extra_py_tests_deps", ], ) + +tpu_py_test( + name = "tpu_strategy_test", + srcs = ["tpu_strategy_test.py"], + disable_experimental = True, + python_version = "PY3", + tags = ["no_oss"], + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:extra_py_tests_deps", + ], +) diff --git a/tensorflow/python/keras/integration_test/tpu_strategy_test.py b/tensorflow/python/keras/integration_test/tpu_strategy_test.py new file mode 100644 index 00000000000..d24e96ae855 --- /dev/null +++ b/tensorflow/python/keras/integration_test/tpu_strategy_test.py @@ -0,0 +1,69 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for TPUStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags + +import tensorflow as tf + + +FLAGS = flags.FLAGS +flags.DEFINE_string("tpu", "", "Name of TPU to connect to.") +flags.DEFINE_string("project", None, "Name of GCP project with TPU.") +flags.DEFINE_string("zone", None, "Name of GCP zone with TPU.") + + +def get_tpu_cluster_resolver(): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, + zone=FLAGS.zone, + project=FLAGS.project, + ) + return resolver + + +def get_tpu_strategy(): + resolver = get_tpu_cluster_resolver() + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + return tf.distribute.experimental.TPUStrategy(resolver) + + +class TpuStrategyTest(tf.test.TestCase): + + def test_keras_metric_outside_strategy_scope_per_replica(self): + strategy = get_tpu_strategy() + metric = tf.keras.metrics.Mean("test_metric", dtype=tf.float32) + + dataset = tf.data.Dataset.range(strategy.num_replicas_in_sync * 2).batch(2) + dataset = strategy.experimental_distribute_dataset(dataset) + + @tf.function + def step_fn(i): + metric.update_state(i) + + with self.assertRaisesRegex(ValueError, "Trying to run metric.update_state " + "in replica context"): + with strategy.scope(): + for i in dataset: + strategy.run(step_fn, args=(i,)) + + +if __name__ == "__main__": + tf.test.main() From 4d54ef31394aefe270826790164edcc6d687bb63 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 20:02:06 -0700 Subject: [PATCH 084/112] Enable type annotations for python/ops. PiperOrigin-RevId: 317234494 Change-Id: I49a24cd1e2127a3c7b0f2eb217cfe023ce5b439f --- tensorflow/python/ops/logging_ops.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 02fce277690..8ca63f55987 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -54,9 +54,11 @@ except NameError: # call relies on certain conditionals for its dependencies. Use # control_flow_ops.Assert. -# Assert and Print are special symbols in Python 2, so we must -# have an upper-case version of them. When support for it is dropped, -# we can allow lowercase. +# Assert and Print are special symbols in python, so we must +# have an upper-case version of them. +# +# For users with Python 3 or Python 2.7 +# with `from __future__ import print_function`, we could also allow lowercase. # See https://github.com/tensorflow/tensorflow/issues/18053 @@ -81,6 +83,11 @@ def Print(input_, data, message=None, first_n=None, summarize=None, name=None): with jupyter notebook (printing to the notebook *server's* output, not into the notebook). 
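# --- Editor's note: illustrative sketch, not part of the patch ---------------
# The docstring text added just below in this hunk states that `tf.print` on
# Python 2.7 needs the `print_function` future import. A minimal, hedged usage
# sketch (an eager TF build is assumed; the tensor values are made up):

from __future__ import print_function  # no-op on Python 3, required on 2.7

import sys

import tensorflow as tf

x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
tf.print("x is:", x, output_stream=sys.stderr)  # sys.stderr is the default
# ------------------------------------------------------------------------------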
+ Additionally, to use tf.print in python 2.7, users must make sure to import + the following: + + `from __future__ import print_function` + Args: input_: A tensor passed through this op. data: A list of tensors to print out when op is evaluated. @@ -141,6 +148,11 @@ def print_v2(*inputs, **kwargs): Python objects. Printed tensors will recursively show the first and last elements of each dimension to summarize. + @compatibility(python2) + In python 2.7, make sure to import the following: + `from __future__ import print_function` + @end_compatibility + Example: Single-input usage: From 7e6e549c461118fbefdb11d03adbc80c27109a8a Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Thu, 18 Jun 2020 20:03:31 -0700 Subject: [PATCH 085/112] Support packed variable in DistributedVariable. Add an option to enable packed variable in TPUStrategy. PiperOrigin-RevId: 317234665 Change-Id: I09e806cb8261815cd87a6d98817556dd8f7e8ed7 --- tensorflow/python/distribute/BUILD | 6 +- .../python/distribute/checkpointing_test.py | 2 + .../custom_training_loop_input_test.py | 7 +- .../python/distribute/distribute_utils.py | 3 + .../distribute/packed_distributed_variable.py | 13 +- .../packed_distributed_variable_test.py | 6 +- .../distribute/saved_model_test_base.py | 1 + .../distribute/strategy_combinations.py | 20 +- tensorflow/python/distribute/tpu_strategy.py | 50 ++- .../python/distribute/tpu_strategy_test.py | 410 +++++++++--------- tensorflow/python/distribute/tpu_values.py | 41 +- tensorflow/python/distribute/values.py | 53 ++- tensorflow/python/distribute/values_test.py | 91 ++-- tensorflow/python/distribute/values_util.py | 12 +- tensorflow/python/tpu/tpu.py | 9 +- 15 files changed, 454 insertions(+), 270 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 4d77c12f975..0062705126f 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -654,6 +654,7 @@ tpu_py_test( "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", + "@absl_py//absl/testing:parameterized", ], ) @@ -787,6 +788,7 @@ py_library( name = "tpu_values", srcs = ["tpu_values.py"], deps = [ + ":packed_distributed_variable", ":values", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", @@ -1602,7 +1604,7 @@ distribute_py_test( srcs = ["saved_model_save_load_test.py"], full_precision = True, main = "saved_model_save_load_test.py", - shard_count = 5, + shard_count = 7, tags = [ "multi_and_single_gpu", "no_rocm", @@ -1635,7 +1637,7 @@ distribute_py_test( srcs = ["saved_model_mixed_api_test.py"], full_precision = True, main = "saved_model_mixed_api_test.py", - shard_count = 5, + shard_count = 7, tags = [ "multi_and_single_gpu", "no_rocm", diff --git a/tensorflow/python/distribute/checkpointing_test.py b/tensorflow/python/distribute/checkpointing_test.py index ad646905315..edd4c46c371 100644 --- a/tensorflow/python/distribute/checkpointing_test.py +++ b/tensorflow/python/distribute/checkpointing_test.py @@ -103,6 +103,7 @@ class TrainingCheckpointTests(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ], mode=["eager"])) @@ -138,6 +139,7 @@ class TrainingCheckpointTests(test.TestCase, parameterized.TestCase): 
strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ], mode=["eager"])) diff --git a/tensorflow/python/distribute/custom_training_loop_input_test.py b/tensorflow/python/distribute/custom_training_loop_input_test.py index 5d1584f5aa7..e4f782810dd 100644 --- a/tensorflow/python/distribute/custom_training_loop_input_test.py +++ b/tensorflow/python/distribute/custom_training_loop_input_test.py @@ -197,7 +197,8 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, combinations.combine( distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy + strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testNestedOutput(self, distribution): @@ -748,6 +749,10 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, mode=["eager"] )) def testMultiDeviceDataCapturedFunction(self, distribution): + if getattr(distribution, "_enable_packed_variable_in_eager_mode", False): + self.skipTest( + "Dataset captured function doesn't support packed tensors yet " + "(b/145922293).") inputs = constant_op.constant([2., 3.]) dataset = lambda _: dataset_ops.Dataset.from_tensor_slices(inputs).repeat(5) input_iterator = iter( diff --git a/tensorflow/python/distribute/distribute_utils.py b/tensorflow/python/distribute/distribute_utils.py index ccf19521718..14b934b4a0f 100644 --- a/tensorflow/python/distribute/distribute_utils.py +++ b/tensorflow/python/distribute/distribute_utils.py @@ -148,6 +148,9 @@ def select_replica_mirrored(replica_id, structured): raise TypeError( "Expected value to be mirrored across replicas: %s in %s." % (x, structured)) + packed_var = getattr(x, "_packed_variable", None) + if packed_var is not None: + return packed_var return x.values[replica_id] else: return x diff --git a/tensorflow/python/distribute/packed_distributed_variable.py b/tensorflow/python/distribute/packed_distributed_variable.py index 62512cb4414..c249b8efc1c 100644 --- a/tensorflow/python/distribute/packed_distributed_variable.py +++ b/tensorflow/python/distribute/packed_distributed_variable.py @@ -42,7 +42,7 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): name: Optional name for the variable. Defaults to `'Variable'` and gets uniquified automatically. 
""" - if not context.executing_eagerly(): + if not ops.executing_eagerly_outside_functions(): raise ValueError( "PackedDistributedVariable should be created in eager mode.") if not distributed_variables: @@ -84,6 +84,9 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): def devices(self): return self._devices + def on_device(self, device): + return PackedVarAndDevice(self, device) + def get_var_on_device(self, device): for i, d in enumerate(self._devices): if d == device: @@ -100,7 +103,10 @@ class PackedDistributedVariable(resource_variable_ops.BaseResourceVariable): @property def handle(self): - return self._handle + if context.executing_eagerly(): + return self.get_var_on_current_device().handle + else: + return self._handle def _read_variable_op(self): if context.executing_eagerly(): @@ -269,7 +275,8 @@ class PackedVarAndDevice(object): @property def handle(self): - return self._var.handle + with ops.device(self._device): + return self._var.handle @property def op(self): diff --git a/tensorflow/python/distribute/packed_distributed_variable_test.py b/tensorflow/python/distribute/packed_distributed_variable_test.py index d29d19960a5..ec2e476e4b8 100644 --- a/tensorflow/python/distribute/packed_distributed_variable_test.py +++ b/tensorflow/python/distribute/packed_distributed_variable_test.py @@ -46,7 +46,7 @@ class PackedDistributedVariableTest(test.TestCase): v1 = resource_variable_ops.ResourceVariable(2.0, name='var1') packed_var = packed_distributed_variable.PackedDistributedVariable([v0, v1]) - self.assertTrue(packed_var.handle.is_packed) + self.assertFalse(packed_var.handle.is_packed) self.assertTrue(packed_var.is_initialized) with ops.device('/cpu:0'): @@ -61,6 +61,7 @@ class PackedDistributedVariableTest(test.TestCase): @def_function.function def update_var(): + self.assertTrue(packed_var.handle.is_packed) with ops.device('/cpu:0'): packed_var.assign_add(3.0).assign_sub(1.0) read0 = packed_var.value() @@ -85,7 +86,7 @@ class PackedDistributedVariableTest(test.TestCase): packed_var0 = packed_distributed_variable.PackedVarAndDevice( packed_var, device0) - self.assertTrue(packed_var0.handle.is_packed) + self.assertFalse(packed_var0.handle.is_packed) self.assertAllEqual(math_ops.mul(packed_var0, 2.0), 2.0) packed_var1 = packed_distributed_variable.PackedVarAndDevice( @@ -94,6 +95,7 @@ class PackedDistributedVariableTest(test.TestCase): @def_function.function def func(): + self.assertTrue(packed_var.handle.is_packed) var0 = packed_distributed_variable.PackedVarAndDevice(packed_var, device0) var0.assign_add(3.0) var1 = packed_distributed_variable.PackedVarAndDevice(packed_var, device1) diff --git a/tensorflow/python/distribute/saved_model_test_base.py b/tensorflow/python/distribute/saved_model_test_base.py index e544e51cddd..70ea582baff 100644 --- a/tensorflow/python/distribute/saved_model_test_base.py +++ b/tensorflow/python/distribute/saved_model_test_base.py @@ -58,6 +58,7 @@ strategies = [ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.mirrored_strategy_with_two_gpus, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ] diff --git a/tensorflow/python/distribute/strategy_combinations.py b/tensorflow/python/distribute/strategy_combinations.py index 350b187f67f..1fa42cb8645 100644 --- a/tensorflow/python/distribute/strategy_combinations.py +++ b/tensorflow/python/distribute/strategy_combinations.py @@ -53,7 +53,11 @@ 
_did_connect_to_cluster = False # pylint: disable=missing-docstring -def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs): +def _get_tpu_strategy_creator(steps_per_run, + use_single_core=False, + enable_packed_variable=False, + **kwargs): + def _create_tpu_strategy(): global _did_connect_to_cluster @@ -87,10 +91,13 @@ def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs): # Steps per run is only supported in TF 1.x if tf2.enabled(): - return tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs) + strategy = tpu_lib.TPUStrategy(resolver, device_assignment, **kwargs) else: - return tpu_lib.TPUStrategyV1(resolver, steps_per_run, - device_assignment, **kwargs) + strategy = tpu_lib.TPUStrategyV1(resolver, steps_per_run, + device_assignment, **kwargs) + strategy._enable_packed_variable_in_eager_mode = enable_packed_variable # pylint: disable=protected-access + return strategy + return _create_tpu_strategy @@ -117,6 +124,10 @@ one_device_strategy_gpu_on_worker_1 = combinations.NamedDistribution( required_gpus=1) tpu_strategy = combinations.NamedDistribution( "TPU", _get_tpu_strategy_creator(steps_per_run=2), required_tpu=True) +tpu_strategy_packed_var = combinations.NamedDistribution( + "TPUPackedVar", + _get_tpu_strategy_creator(steps_per_run=2, enable_packed_variable=True), + required_tpu=True) tpu_strategy_one_step = combinations.NamedDistribution( "TPUOneStep", _get_tpu_strategy_creator(steps_per_run=1), required_tpu=True) tpu_strategy_one_core = combinations.NamedDistribution( @@ -286,6 +297,7 @@ strategies_minus_default_and_tpu = [ tpu_strategies = [ tpu_strategy, # steps_per_run=2 tpu_strategy_one_step, + tpu_strategy_packed_var, cloud_tpu_strategy, ] diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 9493ecce767..7e8f5b97e7e 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -141,6 +141,10 @@ class TPUStrategy(distribute_lib.Strategy): "num_workers").set(self.extended.num_hosts) distribute_lib.distribution_strategy_replica_gauge.get_cell( "num_replicas_per_worker").set(self.extended.num_replicas_per_host) + # Packed variable is used to reduce the overhead of function execution. + # For a DistributedVariable, only one variable handle is captured into a + # function graph. It's only supported in eager mode. + self._enable_packed_variable_in_eager_mode = False # TODO(cjfj): Modify `_call_for_each_replica` in `TPUExtended` such that this # can use the default implementation. @@ -185,6 +189,10 @@ class TPUStrategyV1(distribute_lib.StrategyV1): "num_workers").set(self.extended.num_hosts) distribute_lib.distribution_strategy_replica_gauge.get_cell( "num_replicas_per_worker").set(self.extended.num_replicas_per_host) + # Packed variable is used to reduce the overhead of function execution. + # For a DistributedVariable, only one variable handle is captured into a + # function graph. It's only supported in eager mode. 
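# --- Editor's note: illustrative sketch, not part of the patch ---------------
# How the private flag introduced here is expected to be switched on, mirroring
# what strategy_combinations.py and tpu_strategy_test.py do elsewhere in this
# patch (the resolver arguments are placeholders for a real TPU worker):

import tensorflow as tf

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

strategy = tf.distribute.experimental.TPUStrategy(resolver)
# Opt in to packed variables; eager mode only, as the comment above notes.
strategy._enable_packed_variable_in_eager_mode = True  # pylint: disable=protected-access
# ------------------------------------------------------------------------------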
+ self._enable_packed_variable_in_eager_mode = False @property def steps_per_run(self): @@ -671,20 +679,29 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): return cross_device_ops_lib.reduce_non_distributed_value( reduce_op, value, destinations, self._num_replicas_in_sync) + value_list = value.values + # pylint: disable=protected-access + if isinstance( + value, + values.DistributedVariable) and value._packed_variable is not None: + value_list = tuple( + value._packed_variable.on_device(d) + for d in value._packed_variable.devices) + # pylint: enable=protected-access + # Currently XLA op by op mode has a limit for the number of inputs for a # single op, thus we break one `add_n` op into a group of `add_n` ops to # work around the constraint. # TODO(cjfj): Detect when it is possible to use `cross_replica_sum`. if len(value.values) <= _XLA_OP_BY_OP_INPUTS_LIMIT: - output = math_ops.add_n(value.values) + output = math_ops.add_n(value_list) else: - output = array_ops.zeros_like( - value.values[0], dtype=value.values[0].dtype) - for i in range(0, len(value.values), _XLA_OP_BY_OP_INPUTS_LIMIT): - output += math_ops.add_n(value.values[i:i + _XLA_OP_BY_OP_INPUTS_LIMIT]) + output = array_ops.zeros_like(value_list[0], dtype=value_list[0].dtype) + for i in range(0, len(value_list), _XLA_OP_BY_OP_INPUTS_LIMIT): + output += math_ops.add_n(value_list[i:i + _XLA_OP_BY_OP_INPUTS_LIMIT]) if reduce_op == reduce_util.ReduceOp.MEAN: - output *= (1. / len(value.values)) + output *= (1. / len(value_list)) devices = cross_device_ops_lib.get_devices_from(destinations) @@ -710,17 +727,28 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): else: return (fn(var, *args, **kwargs),) - # Otherwise, we revert to MirroredStrategy behavior and update each variable - # directly. + # Otherwise, we revert to MirroredStrategy behavior and update the variable + # on each replica directly. updates = [] - for i, v in enumerate(var.values): + values_and_devices = [] + packed_var = var._packed_variable # pylint: disable=protected-access + if packed_var is not None: + for device in packed_var.devices: + values_and_devices.append((packed_var, device)) + else: + for value in var.values: + values_and_devices.append((value, value.device)) + + for i, value_and_device in enumerate(values_and_devices): + value = value_and_device[0] + device = value_and_device[1] name = "update_%d" % i - with ops.device(v.device), \ + with ops.device(device), \ distribute_lib.UpdateContext(i), \ ops.name_scope(name): # If args and kwargs are not mirrored, the value is returned as is. 
updates.append( - fn(v, *distribute_utils.select_replica_mirrored(i, args), + fn(value, *distribute_utils.select_replica_mirrored(i, args), **distribute_utils.select_replica_mirrored(i, kwargs))) return distribute_utils.update_regroup(self, updates, group) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 400b12112d6..4070336aae8 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context @@ -64,14 +66,17 @@ def get_tpu_cluster_resolver(): return resolver -def get_tpu_strategy(): +def get_tpu_strategy(enable_packed_var=False): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) tpu_strategy_util.initialize_tpu_system(resolver) - return tpu_lib.TPUStrategy(resolver) + strategy = tpu_lib.TPUStrategy(resolver) + strategy._enable_packed_variable_in_eager_mode = enable_packed_var + return strategy -class TPUStrategyTest(test.TestCase): +# TPU tests which don't use TPUStrategy. +class TPUTest(test.TestCase): def test_multiple_initialize_system(self): resolver = get_tpu_cluster_resolver() @@ -82,177 +87,6 @@ class TPUStrategyTest(test.TestCase): tpu_strategy_util.initialize_tpu_system(resolver) self.assertRegex(str(mock_log.call_args), "already been initialized") - def test_sequential_experimental_runs(self): - resolver = get_tpu_cluster_resolver() - remote.connect_to_cluster(resolver) - topology = tpu_strategy_util.initialize_tpu_system(resolver) - # Computation replicated to all cores. - device_assignment = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=2) - strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment) - - # Computation on the 1st core. - device_assignment2 = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=1) - strategy2 = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment2) - - def computation(x): - return math_ops.square(x) - - @def_function.function - def train_step(): - outputs = strategy.experimental_local_results( - strategy.run(computation, args=([2., 2.],))) - outputs2 = strategy2.run( - computation, args=([outputs[0]],)) - return outputs2 - - self.assertAllEqual([[16., 16.]], train_step()) - - def test_device_switch_case(self): - strategy = get_tpu_strategy() - with strategy.scope(): - a = variables.Variable(1) - - inference_iteration = variables.Variable(-1) - - def inference_fn(x, i): - return a + x + i - - @def_function.function - def run_inference(x): - - def do_inference(device, inference_fn, i): - with ops.device(device): - return inference_fn(x, i) - - branch_fns = { - 0: (lambda: do_inference("/device:TPU:0", inference_fn, 0)), - 1: (lambda: do_inference("/device:TPU:1", inference_fn, 1)), - } - branch_index = inference_iteration.assign_add(1, use_locking=True) % 2 - return control_flow_ops.switch_case(branch_index, branch_fns) - - self.assertAllEqual(2., run_inference(1)) # Use TPU core 0. - self.assertAllEqual(3., run_inference(1)) # Use TPU core 1. 
- - def test_recover_from_compilation_failures(self): - # TODO(b/148150981): Stop skipping this test once recovery works - # for non-local TPU. - if FLAGS.tpu: - self.skipTest("Recovery fails for non-local TPU, see b/148150981") - - # Disable automatic outside compilation. - config.set_soft_device_placement(False) - strategy = get_tpu_strategy() - - @def_function.function - def compilation_failure_run(): - - def computation(): - return random_ops.random_gamma([10], [0.5, 1.5]) - - return strategy.run(computation) - - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "TPU compilation failed"): - compilation_failure_run() - - @def_function.function - def good_run(): - - def computation(): - return random_ops.random_normal([10]) - - return strategy.run(computation) - - good_run() - - def test_dynamic_shape_with_outside_compilation_failure(self): - # Enable automatic outside compilation. - config.set_soft_device_placement(True) - strategy = get_tpu_strategy() - dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( - 2, drop_remainder=False) - dataset = strategy.experimental_distribute_dataset(dataset) - iterator = iter(dataset) - - @def_function.function - def train_fn(iterator): - - def step_fn(inputs): - _, inputs = inputs - return math_ops.reduce_sum(inputs) - - return strategy.experimental_local_results( - strategy.run(step_fn, args=(next(iterator),))) - - with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): - logging.info(train_fn(iterator)) - - def test_computation_on_subset_cores(self): - resolver = get_tpu_cluster_resolver() - remote.connect_to_cluster(resolver) - topology = tpu_strategy_util.initialize_tpu_system(resolver) - all_core_strategy = tpu_lib.TPUStrategy(resolver) - - with all_core_strategy.scope(): - v = variables.Variable(0.0, - aggregation=variables.VariableAggregation.MEAN) - - # Computation on the 1st core. - device_assignment = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=1) - first_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment) - - # Computation on the 2nd core. - device_assignment2 = device_assignment_lib.DeviceAssignment( - topology, [[[0, 0, 0, 1]]]) - second_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment2) - - @def_function.function - def train_step(): - - def step_fn(): - return v + 1.0 - - all_core_strategy.run(step_fn) - r1 = first_core_strategy.run(step_fn) - r2 = second_core_strategy.run(step_fn) - return r1 + r2 - - train_step() - self.assertAllEqual(2., train_step()) - - def test_worker_devices_on_subset_cores(self): - resolver = get_tpu_cluster_resolver() - remote.connect_to_cluster(resolver) - topology = tpu_strategy_util.initialize_tpu_system(resolver) - - # Strategy for the 1st core. - device_assignment = device_assignment_lib.DeviceAssignment.build( - topology, num_replicas=1) - first_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment) - - # Strategy for the 2nd core. 
- device_assignment2 = device_assignment_lib.DeviceAssignment( - topology, [[[0, 0, 0, 1]]]) - second_core_strategy = tpu_lib.TPUStrategy( - resolver, device_assignment=device_assignment2) - - self.assertLen(first_core_strategy.extended.worker_devices, 1) - self.assertEndsWith(first_core_strategy.extended.worker_devices[0], - "device:TPU:0") - - self.assertLen(second_core_strategy.extended.worker_devices, 1) - self.assertEndsWith(second_core_strategy.extended.worker_devices[0], - "device:TPU:1") - def test_tpu_tf_function_same_device(self): with ops.device("/device:TPU:0"): a = variables.Variable(1) @@ -288,8 +122,194 @@ class TPUStrategyTest(test.TestCase): result = bar() + 1 self.assertAllEqual(result, 2) - def test_control_output_in_while_body_fn(self): - strategy = get_tpu_strategy() + +@parameterized.named_parameters([("PackedVar", True), ("", False)]) +class TPUStrategyTest(test.TestCase, parameterized.TestCase): + + def test_sequential_experimental_runs(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + topology = tpu_strategy_util.initialize_tpu_system(resolver) + # Computation replicated to all cores. + device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=2) + strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment) + strategy._enable_packed_variable_in_eager_mode = enable_packed_var + + # Computation on the 1st core. + device_assignment2 = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1) + strategy2 = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment2) + + def computation(x): + return math_ops.square(x) + + @def_function.function + def train_step(): + outputs = strategy.experimental_local_results( + strategy.run(computation, args=([2., 2.],))) + outputs2 = strategy2.run( + computation, args=([outputs[0]],)) + return outputs2 + + self.assertAllEqual([[16., 16.]], train_step()) + + def test_device_switch_case(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) + with strategy.scope(): + a = variables.Variable(1) + + inference_iteration = variables.Variable(-1) + + def inference_fn(x, i): + return a + x + i + + @def_function.function + def run_inference(x): + + def do_inference(device, inference_fn, i): + with ops.device(device): + return inference_fn(x, i) + + branch_fns = { + 0: (lambda: do_inference("/device:TPU:0", inference_fn, 0)), + 1: (lambda: do_inference("/device:TPU:1", inference_fn, 1)), + } + branch_index = inference_iteration.assign_add(1, use_locking=True) % 2 + return control_flow_ops.switch_case(branch_index, branch_fns) + + self.assertAllEqual(2., run_inference(1)) # Use TPU core 0. + self.assertAllEqual(3., run_inference(1)) # Use TPU core 1. + + def test_recover_from_compilation_failures(self, enable_packed_var): + # TODO(b/148150981): Stop skipping this test once recovery works + # for non-local TPU. + if FLAGS.tpu: + self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. 
+ config.set_soft_device_placement(False) + strategy = get_tpu_strategy(enable_packed_var) + + @def_function.function + def compilation_failure_run(): + + def computation(): + return random_ops.random_gamma([10], [0.5, 1.5]) + + return strategy.run(computation) + + with self.assertRaisesRegex(errors.InvalidArgumentError, + "TPU compilation failed"): + compilation_failure_run() + + @def_function.function + def good_run(): + + def computation(): + return random_ops.random_normal([10]) + + return strategy.run(computation) + + good_run() + + def test_dynamic_shape_with_outside_compilation_failure( + self, enable_packed_var): + # Enable automatic outside compilation. + config.set_soft_device_placement(True) + strategy = get_tpu_strategy(enable_packed_var) + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + + def test_computation_on_subset_cores(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + topology = tpu_strategy_util.initialize_tpu_system(resolver) + all_core_strategy = tpu_lib.TPUStrategy(resolver) + all_core_strategy._enable_packed_variable_in_eager_mode = enable_packed_var + + with all_core_strategy.scope(): + v = variables.Variable(0.0, + aggregation=variables.VariableAggregation.MEAN) + + # Computation on the 1st core. + device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1) + first_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment) + first_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + # Computation on the 2nd core. + device_assignment2 = device_assignment_lib.DeviceAssignment( + topology, [[[0, 0, 0, 1]]]) + second_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment2) + second_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + @def_function.function + def train_step(): + + def step_fn(): + return v + 1.0 + + all_core_strategy.run(step_fn) + r1 = first_core_strategy.run(step_fn) + r2 = second_core_strategy.run(step_fn) + return r1 + r2 + + train_step() + self.assertAllEqual(2., train_step()) + + def test_worker_devices_on_subset_cores(self, enable_packed_var): + resolver = get_tpu_cluster_resolver() + remote.connect_to_cluster(resolver) + topology = tpu_strategy_util.initialize_tpu_system(resolver) + + # Strategy for the 1st core. + device_assignment = device_assignment_lib.DeviceAssignment.build( + topology, num_replicas=1) + first_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment) + first_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + # Strategy for the 2nd core. 
+ device_assignment2 = device_assignment_lib.DeviceAssignment( + topology, [[[0, 0, 0, 1]]]) + second_core_strategy = tpu_lib.TPUStrategy( + resolver, device_assignment=device_assignment2) + second_core_strategy._enable_packed_variable_in_eager_mode = ( + enable_packed_var) + + self.assertLen(first_core_strategy.extended.worker_devices, 1) + self.assertEndsWith(first_core_strategy.extended.worker_devices[0], + "device:TPU:0") + + self.assertLen(second_core_strategy.extended.worker_devices, 1) + self.assertEndsWith(second_core_strategy.extended.worker_devices[0], + "device:TPU:1") + + def test_control_output_in_while_body_fn(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) with strategy.scope(): v = variables.Variable( @@ -307,8 +327,8 @@ class TPUStrategyTest(test.TestCase): train_step() self.assertEqual(2.0, v.numpy()) - def test_cluster_in_graph_and_while_body_fn(self): - strategy = get_tpu_strategy() + def test_cluster_in_graph_and_while_body_fn(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) @def_function.function def train_step(): @@ -328,8 +348,8 @@ class TPUStrategyTest(test.TestCase): sum_val = train_step().numpy().astype(float) self.assertEqual(sum_val, strategy.num_replicas_in_sync * 10) - def test_two_clusters_with_same_fn(self): - strategy = get_tpu_strategy() + def test_two_clusters_with_same_fn(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) @def_function.function def foo(x): @@ -342,8 +362,8 @@ class TPUStrategyTest(test.TestCase): bar(1) - def test_using_external_variable_inside_tf_function(self): - strategy = get_tpu_strategy() + def test_using_external_variable_inside_tf_function(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) dataset = dataset_ops.Dataset.range( strategy.num_replicas_in_sync * 2, output_type=dtypes.float32).batch(strategy.num_replicas_in_sync) @@ -364,8 +384,8 @@ class TPUStrategyTest(test.TestCase): strategy.experimental_local_results(train_step(next(input_iterator)))) # TODO(b/145574622): Remove this test once it is re-enabled in values_test.py. - def test_all_reduce_on_sync_on_read_variable(self): - strategy = get_tpu_strategy() + def test_all_reduce_on_sync_on_read_variable(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) dataset = dataset_ops.Dataset.range( strategy.num_replicas_in_sync, output_type=dtypes.float32).batch( strategy.num_replicas_in_sync, drop_remainder=True) @@ -404,8 +424,8 @@ class TPUStrategyTest(test.TestCase): self.assertAllEqual((0.,), w.read_value()) # TODO(b/140633529): Re-enable the test. 
- def disable_test_experimental_run_output_on_device(self): - strategy = get_tpu_strategy() + def disable_test_experimental_run_output_on_device(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) def computation(x): return math_ops.square(x) @@ -423,8 +443,8 @@ class TPUStrategyTest(test.TestCase): self.assertAllEqual("/job:localhost/replica:0/task:0/device:TPU:1", results[1].backing_device) - def test_composite_input(self): - strategy = get_tpu_strategy() + def test_composite_input(self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) if strategy.num_replicas_in_sync != 2: self.skipTest("Test assumes two replicas.") @@ -463,8 +483,9 @@ class TPUStrategyTest(test.TestCase): self.assertAllEqual(result, [[[0.0, 1.0], [3.0, 8.0]], [[0.0, 1.0], [3.0, 8.0]]]) - def test_composite_input_dynamic_shapes_outside_compilation(self): - strategy = get_tpu_strategy() + def test_composite_input_dynamic_shapes_outside_compilation( + self, enable_packed_var): + strategy = get_tpu_strategy(enable_packed_var) if strategy.num_replicas_in_sync != 2: self.skipTest("Test assumes two replicas.") @@ -506,11 +527,11 @@ class TPUStrategyTest(test.TestCase): result = sparse_lookup(dataset) self.assertAllEqual(result, [[0.0, 2.0], [1.5, 5.0]]) - def test_per_device_tracing_of_mirrored_variables(self): + def test_per_device_tracing_of_mirrored_variables(self, enable_packed_var): # Define trace_count as a list to avoid python scoping error trace_count = [0] - strategy = get_tpu_strategy() + strategy = get_tpu_strategy(enable_packed_var) with strategy.scope(): variable = variables.Variable(0.0) @@ -527,7 +548,10 @@ class TPUStrategyTest(test.TestCase): with strategy.scope(): update_variable.get_concrete_function() - self.assertEqual(trace_count[0], len(strategy.extended.worker_devices)) + self.assertLen(strategy.extended.worker_devices, trace_count[0]) + + +class TPUStrategyDataPrefetchTest(test.TestCase): def test_prefetch_to_device_default(self): strategy = get_tpu_strategy() diff --git a/tensorflow/python/distribute/tpu_values.py b/tensorflow/python/distribute/tpu_values.py index 40ab058ac7c..33885531966 100644 --- a/tensorflow/python/distribute/tpu_values.py +++ b/tensorflow/python/distribute/tpu_values.py @@ -24,6 +24,7 @@ from __future__ import print_function import contextlib +from tensorflow.python.distribute import packed_distributed_variable as packed from tensorflow.python.distribute import values from tensorflow.python.eager import context from tensorflow.python.eager import tape @@ -46,15 +47,27 @@ def _maybe_enter_graph(tensor): yield +@contextlib.contextmanager +def _maybe_on_device(var): + # Add a device scope for packed variables. + if isinstance(var, packed.PackedVarAndDevice): + with ops.device(var.device): + yield + else: + yield + + def _make_raw_assign_fn(raw_assign_fn): # pylint: disable=missing-docstring def assign_fn(var, value, use_locking=False, name=None, read_value=True): # pylint: disable=missing-docstring del use_locking # Unused. 
- with _maybe_enter_graph(var.handle): + handle = var.handle + with _maybe_enter_graph(handle), _maybe_on_device(var): op = raw_assign_fn( - var.handle, ops.convert_to_tensor(value, dtype=var.dtype), name=name) - + handle, + ops.convert_to_tensor(value, dtype=var.dtype), + name=name) with ops.control_dependencies([op]): return var._read_variable_op() if read_value else op # pylint: disable=protected-access @@ -97,23 +110,37 @@ class TPUVariableMixin(object): @property def handle(self): + """The handle by which this variable can be accessed.""" # If we're in a tpu.rewrite(), return the replicated handle. tpu_context = enclosing_tpu_context() if tpu_context is None or context.executing_eagerly(): return self._get_on_device_or_primary().handle else: - return tpu_context.get_replicated_var_handle(self._handle_id, - self._values, - self._is_mirrored()) + is_packed = self._packed_var is not None + val = self._values + if is_packed: + val = [self._packed_var] + + return tpu_context.get_replicated_var_handle(self._handle_id, val, + self._is_mirrored(), + is_packed) @property def device(self): return self.handle.device def _read_variable_op(self): + """Reads the value of this variable.""" if self.trainable: tape.variable_accessed(self) - return gen_resource_variable_ops.read_variable_op(self.handle, self.dtype) + + handle = self.handle + if getattr(handle, "is_packed", False): + # Add a device scope for a packed variable handle. + with ops.device(self._get_on_device_or_primary().device): + return gen_resource_variable_ops.read_variable_op(handle, self.dtype) + else: + return gen_resource_variable_ops.read_variable_op(handle, self.dtype) def read_value(self): if enclosing_tpu_context() is None: diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 60b2ea4fe31..37643e03b18 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -472,6 +472,12 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, # variable. self._var_policy = var_policy + @property + def _devices(self): + if self._packed_var is not None: + return tuple(d for d in self._packed_var.devices) + return tuple(v.device for v in self._values) + def is_initialized(self, name=None): """Identifies if all the component variables are initialized. @@ -482,6 +488,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, The op that evaluates to True or False depending on if all the component variables are initialized. 
""" + if self._packed_var is not None: + return self._packed_var.is_initialized() result = self._primary.is_initialized() # We iterate through the list of values except the last one to allow us to # name the final `logical_and` op the same name that is passed by the user @@ -552,6 +560,10 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, def aggregation(self): return self._aggregation + @property + def _packed_variable(self): + return self._packed_var + @property def handle(self): replica_id = values_util.get_current_replica_id_as_int() @@ -559,6 +571,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, raise ValueError("`handle` is not available outside the replica context" " or a `tf.distribute.Strategy.update()` call.") else: + if self._packed_var is not None: + return self._packed_var.handle return self._values[replica_id].handle def eval(self, session=None): @@ -607,6 +621,33 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, def _in_graph_mode(self): return self._primary._in_graph_mode # pylint: disable=protected-access + def _get_replica(self, replica_id): + """Returns the value on a device with the given replica_id.""" + if self._packed_var is not None: + return self._packed_var.on_device(self._devices[replica_id]) + return self._values[replica_id] + + def _get(self): + """Returns the value for the current device or raises a ValueError.""" + replica_id = values_util.get_current_replica_id_as_int() + if replica_id is None: + return self._get_cross_replica() + else: + return self._get_replica(replica_id) + + def _get_on_device_or_primary(self): + """Returns value in same replica or device if possible, else the _primary.""" + replica_id = values_util.get_current_replica_id_as_int() + if replica_id is None: + # Try to find a value on the current device. 
+ current_device = device_util.canonicalize(device_util.current()) + for i, value in enumerate(self._values): + if device_util.canonicalize(value.device) == current_device: + return self._get_replica(i) + return self._get_replica(0) + else: + return self._get_replica(replica_id) + def read_value(self): with ds_context.enter_or_assert_strategy(self._distribute_strategy): return array_ops.identity(self._get()) @@ -778,7 +819,8 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, if ds_context.in_cross_replica_context(): update_replica_id = distribute_lib.get_update_replica_id() if update_replica_id is not None: - return update_fn(self._values[update_replica_id], value, **kwargs) + replica_value = self._get_replica(update_replica_id) + return update_fn(replica_value, value, **kwargs) return self._update_cross_replica(update_fn, value, **kwargs) else: values_util.assert_replica_context(self.distribute_strategy) @@ -802,6 +844,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, obj_map[v] = new_obj resource_map[v.handle] = new_obj.handle obj_map[self] = new_obj + resource_map[self.handle] = new_obj.handle resource_map[self] = new_obj.handle return obj_map, resource_map @@ -835,6 +878,12 @@ class _MirroredSaveable(saveable_object_util.ResourceVariableSaveable): def restore(self, restored_tensors, restored_shapes): """Restore the same value into all variables.""" tensor, = restored_tensors + packed_var = self._mirrored_variable._packed_variable # pylint: disable=protected-access + if packed_var is not None: + return control_flow_ops.group( + tuple( + values_util.assign_on_device(d, packed_var, tensor) + for d in packed_var.devices)) return control_flow_ops.group( tuple( values_util.assign_on_device(v.device, v, tensor) @@ -1013,7 +1062,7 @@ class SyncOnReadVariable(DistributedVariable): def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return self._primary + return self._get_replica(0) with ds_context.enter_or_assert_strategy(self._distribute_strategy): return self._distribute_strategy.reduce( diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index 0cb4d6ddd2a..d0e3eec22a8 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -42,7 +42,6 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op -from tensorflow.python.framework import device from tensorflow.python.framework import dtypes from tensorflow.python.framework import indexed_slices from tensorflow.python.framework import ops @@ -234,11 +233,11 @@ class DistributedValuesTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, # TODO(b/137795644): support CentralStroageStrategy # strategy_combinations.central_storage_strategy_with_two_gpus, ], - mode=["eager"] - )) + mode=["eager"])) def testMakeDistributedValueDefaultDevicePlacement(self, distribution): if not tf2.enabled(): self.skipTest("Only V2 is supported.") @@ -259,11 +258,11 @@ class DistributedValuesTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, # 
TODO(b/137795644): support CentralStroageStrategy # strategy_combinations.central_storage_strategy_with_two_gpus, ], - mode=["eager"] - )) + mode=["eager"])) def testMakeDistributedValueExplicitDevicePlacement(self, distribution): if not tf2.enabled(): self.skipTest("Only V2 is supported.") @@ -384,6 +383,16 @@ def _make_mirrored(): return mirrored +def mirrored_and_tpu_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, + ], + mode=["graph", "eager"]) + + class RegroupAndSelectDeviceTest(test.TestCase, parameterized.TestCase): def _is_per_replica(self, result, expected, klass=values.PerReplica): @@ -563,6 +572,7 @@ class RegroupAndSelectDeviceTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, strategy_combinations.central_storage_strategy_with_two_gpus, ], synchronization=[ @@ -708,29 +718,40 @@ class DistributedVariableTest(test.TestCase, parameterized.TestCase): self.evaluate( distribution.experimental_local_results(distribution.run(assign))) - def testPackedVariable(self, distribution, synchronization, aggregation): + +@combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_one_cpu, + strategy_combinations.tpu_strategy, + ], + mode=["eager"])) +class PackedDistributedVariableTest(test.TestCase, parameterized.TestCase): + + def testPackedVariable(self, distribution): with distribution.scope(): - v0 = variables_lib.Variable( - 0., synchronization=synchronization, aggregation=aggregation) - if not isinstance(v0, values.DistributedVariable): - self.skipTest("This test doesn't apply to non DistributedVariables") - - self.assertEqual(v0._packed_var, None) - - device_type = device.DeviceSpec.from_string(v0._devices[0]).device_type - for d in v0._devices: - if device.DeviceSpec.from_string(d).device_type != device_type: - self.skipTest("Packing variables on devices of different types " - "is not supported yet.") + v0 = variables_lib.Variable(0.) 
+ self.assertIsNone(v0._packed_var) distribution._enable_packed_variable_in_eager_mode = True with distribution.scope(): - v1 = variables_lib.Variable( - 0., synchronization=synchronization, aggregation=aggregation) - if ops.executing_eagerly_outside_functions(): + v1 = variables_lib.Variable(0) self.assertIsInstance(v1._packed_var, packed.PackedDistributedVariable) - else: - self.assertEqual(v1._packed_var, None) + + devices = v1._devices + for i in range(1, len(devices)): + with distribute_lib.ReplicaContext(distribution, i): + v1.assign(i) + val = v1._get() + self.assertIsInstance(val, packed.PackedVarAndDevice) + self.assertEqual(val.device, devices[0]) + self.assertEqual(self.evaluate(val.read_value()), 0) + for i in range(0, len(devices)): + with distribute_lib.ReplicaContext(distribution, i): + val = v1._get() + self.assertIsInstance(val, packed.PackedVarAndDevice) + self.assertEqual(val.device, devices[i]) + self.assertEqual(self.evaluate(val.read_value()), i) class MirroredVariableTest(test.TestCase, parameterized.TestCase): @@ -920,6 +941,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testAssignValueInReplicaContextWithoutAggregation(self, distribution): @@ -943,6 +965,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"])) def testValueInReplicaContext(self, distribution): @@ -968,6 +991,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"])) def testAssignOutOfScope(self, distribution): @@ -1041,6 +1065,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testInitializedToSameValueInsideEagerRun(self, distribution): @@ -1066,6 +1091,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): strategy_combinations.mirrored_strategy_with_one_cpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"])) def testAggregationOnlyFirstReplica(self, distribution): @@ -1093,6 +1119,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution=[ strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["eager"])) def testInitScope(self, distribution): @@ -1143,13 +1170,7 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): distribution.experimental_local_results(distribution.run(add))) self.assertAllEqual([2, 2], per_replica_results) - @combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy, - ], - mode=["graph", "eager"])) + 
@combinations.generate(mirrored_and_tpu_strategy_combinations()) def testAssignAdd(self, distribution): with distribution.scope(): v = variable_scope.variable( @@ -1456,15 +1477,6 @@ class SyncOnReadVariablePropertiesTest(test.TestCase): self.assertEqual(2., self.evaluate(add1(replica_local))) -def mirrored_and_tpu_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy, - ], - mode=["graph", "eager"]) - - # TODO(b/144432582): Add variable aggregation type to combinations to simplify # tests. def strategy_and_run_tf_function_combinations(): @@ -1478,6 +1490,7 @@ def strategy_and_run_tf_function_combinations(): experimental_run_tf_function=[True, False]) + combinations.combine( distribution=[ strategy_combinations.tpu_strategy, + strategy_combinations.tpu_strategy_packed_var, ], mode=["graph", "eager"], experimental_run_tf_function=[True]) diff --git a/tensorflow/python/distribute/values_util.py b/tensorflow/python/distribute/values_util.py index ddb0d2d0401..5909bdd229e 100644 --- a/tensorflow/python/distribute/values_util.py +++ b/tensorflow/python/distribute/values_util.py @@ -61,8 +61,14 @@ def on_write_assign_sub(var, value, use_locking=False, name=None, def assign_on_each_device(var, assign_func, value, read_value): - update = control_flow_ops.group( - tuple(assign_func(v.device, v, value) for v in var._values)) # pylint: disable=protected-access + """Update the variable on each replica with the given assign_func and value.""" + if var._packed_variable is not None: # pylint: disable=protected-access + update = control_flow_ops.group( + tuple( + assign_func(d, var._packed_variable, value) for d in var._devices)) # pylint: disable=protected-access + else: + update = control_flow_ops.group( + tuple(assign_func(v.device, v, value) for v in var._values)) # pylint: disable=protected-access if not read_value: return update with ops.control_dependencies([update] if update else []): @@ -104,7 +110,7 @@ def on_read_assign_cross_replica(var, value, read_value=True): # TODO(anjs): Should this be over all the replicas in sync since we # call `reduce` on the variable during read? if var.aggregation == vs.VariableAggregation.SUM: - tensor = math_ops.cast(tensor / len(var._values), var.dtype) # pylint: disable=protected-access + tensor = math_ops.cast(tensor / len(var._devices), var.dtype) # pylint: disable=protected-access return assign_on_each_device(var, assign_on_device, tensor, read_value) diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index ce3aaa8a058..6f5f0bc26c2 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -298,7 +298,8 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): self._pivot = pivot self._replicated_vars = {} - def get_replicated_var_handle(self, name, vars_, is_mirrored=False): + def get_replicated_var_handle(self, name, vars_, is_mirrored=False, + is_packed=False): """Returns a variable handle for replicated TPU variable 'var'. This is a method used by an experimental replicated variable implementation @@ -309,6 +310,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): vars_: The replicated TPU variables. is_mirrored: Whether the variables are mirrored, which guarantees the values in each replica are always the same. + is_packed: Whether the replicated variables are packed into one variable. Returns: The handle of the TPU replicated input node. 
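# --- Editor's note: illustrative sketch, not part of the patch ---------------
# How the new `is_packed` argument is driven from the caller side, mirroring
# the TPUVariableMixin.handle change in tpu_values.py earlier in this patch:
#
#   is_packed = self._packed_var is not None
#   val = [self._packed_var] if is_packed else self._values
#   handle = tpu_context.get_replicated_var_handle(
#       self._handle_id, val, self._is_mirrored(), is_packed)
#
# When packing is enabled, a single packed variable stands in for the whole
# per-replica list, and tpu_ops.tpu_replicated_input is then called with
# is_packed=True so only one handle is captured into the function graph.
# ------------------------------------------------------------------------------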
@@ -320,7 +322,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): if handle is not None: return handle - if device_assignment is not None: + if device_assignment is not None and not is_packed: # Find a variable copy for each replica in the device assignment. # Note that the order of devices for replicas for the variable and the # device assignment might not match. @@ -356,7 +358,8 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext): graph._set_control_flow_context(self.outer_context) handle = tpu_ops.tpu_replicated_input([v.handle for v in replicated_vars], name=name + "/handle", - is_mirrored_variable=is_mirrored) + is_mirrored_variable=is_mirrored, + is_packed=is_packed) graph._set_control_flow_context(saved_context) # pylint: enable=protected-access self._replicated_vars[name] = handle From 62082d40720b56974436dd17625c247a5fce2a6b Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Thu, 18 Jun 2020 20:16:19 -0700 Subject: [PATCH 086/112] Add build flags for objc libraries PiperOrigin-RevId: 317235962 Change-Id: I976ccd1ce3db49be3acac44f60b2dc44ed25d767 --- tensorflow/lite/experimental/objc/BUILD.apple | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple index ff7e8fa58e9..09d4547813a 100644 --- a/tensorflow/lite/experimental/objc/BUILD.apple +++ b/tensorflow/lite/experimental/objc/BUILD.apple @@ -97,7 +97,7 @@ objc_library( "//tensorflow/lite:testdata/add.bin", "//tensorflow/lite:testdata/add_quantized.bin", ], - tags = TFL_DEFAULT_TAGS, + tags = TFL_DEFAULT_TAGS + ["builder_default_ios_x86_64"], deps = [ ":TensorFlowLite", ], @@ -135,7 +135,10 @@ objc_library( "apis", ], module_name = "TestApp", - tags = TFL_DEFAULT_TAGS + ["manual"], + tags = TFL_DEFAULT_TAGS + [ + "manual", + "builder_default_ios_x86_64", + ], deps = [ ":TensorFlowLite", ], From 85ad8031f60536361de71dd689c9d88848fefed6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 18 Jun 2020 20:27:03 -0700 Subject: [PATCH 087/112] Expand dtype support for Neg PiperOrigin-RevId: 317237033 Change-Id: I59c5e45d469f7bf704976b66bc122aaac3982b5e --- .../mlir/tensorflow/ir/tf_generated_ops.td | 4 +-- tensorflow/core/kernels/BUILD | 3 ++- .../core/kernels/cwise_op_gpu_neg.cu.cc | 4 +-- .../{cwise_op_neg.cc => cwise_op_neg_1.cc} | 6 ++--- tensorflow/core/kernels/cwise_op_neg_2.cc | 26 +++++++++++++++++++ tensorflow/core/ops/math_ops.cc | 12 ++++----- .../kernel_tests/cwise_ops_unary_test.py | 6 +++++ 7 files changed, 46 insertions(+), 15 deletions(-) rename tensorflow/core/kernels/{cwise_op_neg.cc => cwise_op_neg_1.cc} (87%) create mode 100644 tensorflow/core/kernels/cwise_op_neg_2.cc diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index dcd083fc398..3b1f3eec699 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -6059,11 +6059,11 @@ I.e., \\(y = -x\\). 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index ffe2a035591..279dff92c58 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -6802,7 +6802,8 @@ filegroup( "cwise_op_minimum.cc", "cwise_op_mul_1.cc", "cwise_op_mul_2.cc", - "cwise_op_neg.cc", + "cwise_op_neg_1.cc", + "cwise_op_neg_2.cc", "cwise_op_pow.cc", "cwise_op_real.cc", "cwise_op_reciprocal.cc", diff --git a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc index ea1ca623560..4f7bb9b2075 100644 --- a/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc +++ b/tensorflow/core/kernels/cwise_op_gpu_neg.cu.cc @@ -19,8 +19,8 @@ limitations under the License. namespace tensorflow { namespace functor { -DEFINE_UNARY7(neg, Eigen::half, float, double, int32, int64, complex64, - complex128); +DEFINE_UNARY4(neg, int8, int16, int32, int64); +DEFINE_UNARY6(neg, Eigen::half, float, double, bfloat16, complex64, complex128); } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg_1.cc similarity index 87% rename from tensorflow/core/kernels/cwise_op_neg.cc rename to tensorflow/core/kernels/cwise_op_neg_1.cc index f52cf6c8b91..18a7c61be90 100644 --- a/tensorflow/core/kernels/cwise_op_neg.cc +++ b/tensorflow/core/kernels/cwise_op_neg_1.cc @@ -16,8 +16,7 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER8(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32, - complex64, int64, complex128, bfloat16); +REGISTER4(UnaryOp, CPU, "Neg", functor::neg, int8, int16, int32, int64); #ifdef TENSORFLOW_USE_SYCL REGISTER3(UnaryOp, SYCL, "Neg", functor::neg, float, double, int64); @@ -30,8 +29,7 @@ REGISTER_KERNEL_BUILDER(Name("Neg") #endif // TENSORFLOW_USE_SYCL #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -REGISTER6(UnaryOp, GPU, "Neg", functor::neg, float, Eigen::half, double, int64, - complex64, complex128); +REGISTER3(UnaryOp, GPU, "Neg", functor::neg, int8, int16, int64); // A special GPU kernel for int32. // TODO(b/25387198): Also enable int32 in device memory. This kernel diff --git a/tensorflow/core/kernels/cwise_op_neg_2.cc b/tensorflow/core/kernels/cwise_op_neg_2.cc new file mode 100644 index 00000000000..5ea78ad665c --- /dev/null +++ b/tensorflow/core/kernels/cwise_op_neg_2.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/cwise_ops_common.h" + +namespace tensorflow { +REGISTER6(UnaryOp, CPU, "Neg", functor::neg, Eigen::half, float, double, + bfloat16, complex64, complex128); + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +REGISTER6(UnaryOp, GPU, "Neg", functor::neg, Eigen::half, float, double, + bfloat16, complex64, complex128); +#endif +} // namespace tensorflow diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index b81bb9d3afc..2a70f420260 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -201,12 +201,12 @@ REGISTER_OP("ComplexAbs") .SetShapeFn(shape_inference::UnchangedShape); // Declares cwise unary operations signature: 't -> 't -#define UNARY() \ - Input("x: T") \ - .Output("y: T") \ - .Attr( \ - "T: {bfloat16, half, float, double, int32, int64, complex64, " \ - "complex128}") \ +#define UNARY() \ + Input("x: T") \ + .Output("y: T") \ + .Attr( \ + "T: {bfloat16, half, float, double, int8, int16, int32, int64, " \ + "complex64, complex128}") \ .SetShapeFn(shape_inference::UnchangedShape) #define UNARY_REAL() \ diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py index f4beaabc29a..df848a653d4 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py @@ -389,16 +389,22 @@ class UnaryOpTest(test.TestCase): 2).reshape(1, 3, 2).astype(dtypes_lib.bfloat16.as_numpy_dtype) self._compareCpu(x, np.abs, math_ops.abs) self._compareCpu(x, np.abs, _ABS) + self._compareBoth(x, np.negative, math_ops.negative) + self._compareBoth(x, np.negative, _NEG) def testInt8Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int8) self._compareCpu(x, np.abs, math_ops.abs) self._compareCpu(x, np.abs, _ABS) + self._compareBoth(x, np.negative, math_ops.negative) + self._compareBoth(x, np.negative, _NEG) def testInt16Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int16) self._compareCpu(x, np.abs, math_ops.abs) self._compareCpu(x, np.abs, _ABS) + self._compareBoth(x, np.negative, math_ops.negative) + self._compareBoth(x, np.negative, _NEG) def testInt32Basic(self): x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int32) From 2a05589bd4f5e3042d1baf539e564d7ab9bd6287 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 18 Jun 2020 20:48:49 -0700 Subject: [PATCH 088/112] Add inference instruction for iOS PiperOrigin-RevId: 317239235 Change-Id: I55bd7e43bc286f34024ccfc27db61d28304a651d --- tensorflow/lite/g3doc/guide/inference.md | 425 +++++++++++++++-------- 1 file changed, 287 insertions(+), 138 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/inference.md b/tensorflow/lite/g3doc/guide/inference.md index 5f3fba98cff..6e47d6d5190 100644 --- a/tensorflow/lite/g3doc/guide/inference.md +++ b/tensorflow/lite/g3doc/guide/inference.md @@ -7,9 +7,9 @@ inference with a TensorFlow Lite model, you must run it through an The interpreter uses a static graph ordering and a custom (less-dynamic) memory allocator to ensure minimal load, initialization, and execution latency. -This page describes how to access to the TensorFlow Lite interpreter and -perform an inference using C++, Java, and Python, plus links to other resources -for each [supported platform](#supported-platforms). 
+This page describes how to access to the TensorFlow Lite interpreter and perform +an inference using C++, Java, and Python, plus links to other resources for each +[supported platform](#supported-platforms). [TOC] @@ -17,31 +17,31 @@ for each [supported platform](#supported-platforms). TensorFlow Lite inference typically follows the following steps: -1. **Loading a model** +1. **Loading a model** - You must load the `.tflite` model into memory, which contains the model's - execution graph. + You must load the `.tflite` model into memory, which contains the model's + execution graph. -1. **Transforming data** +1. **Transforming data** - Raw input data for the model generally does not match the input data format - expected by the model. For example, you might need to resize an image or - change the image format to be compatible with the model. + Raw input data for the model generally does not match the input data format + expected by the model. For example, you might need to resize an image or + change the image format to be compatible with the model. -1. **Running inference** +1. **Running inference** - This step involves using the TensorFlow Lite API to execute the model. It - involves a few steps such as building the interpreter, and allocating - tensors, as described in the following sections. + This step involves using the TensorFlow Lite API to execute the model. It + involves a few steps such as building the interpreter, and allocating + tensors, as described in the following sections. -1. **Interpreting output** +1. **Interpreting output** - When you receive results from the model inference, you must interpret the - tensors in a meaningful way that's useful in your application. + When you receive results from the model inference, you must interpret the + tensors in a meaningful way that's useful in your application. - For example, a model might return only a list of probabilities. It's up to - you to map the probabilities to relevant categories and present it to your - end-user. + For example, a model might return only a list of probabilities. It's up to + you to map the probabilities to relevant categories and present it to your + end-user. ## Supported platforms @@ -54,8 +54,8 @@ should be no surprise that the APIs try to avoid unnecessary copies at the expense of convenience. Similarly, consistency with TensorFlow APIs was not an explicit goal and some variance between languages is to be expected. -Across all libraries, the TensorFlow Lite API enables you to load models, -feed inputs, and retrieve inference outputs. +Across all libraries, the TensorFlow Lite API enables you to load models, feed +inputs, and retrieve inference outputs. ### Android @@ -64,8 +64,8 @@ APIs. The Java APIs provide convenience and can be used directly within your Android Activity classes. The C++ APIs offer more flexibility and speed, but may require writing JNI wrappers to move data between Java and C++ layers. -See below for details about using C++ and Java, or -follow the [Android quickstart](android.md) for a tutorial and example code. +See below for details about using C++ and Java, or follow the +[Android quickstart](android.md) for a tutorial and example code. #### TensorFlow Lite Android wrapper code generator @@ -86,103 +86,36 @@ On iOS, TensorFlow Lite is available with native iOS libraries written in [Swift](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift) and [Objective-C](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc). 
+You can also use +[C API](https://www.tensorflow.org/code/tensorflow/lite/c/c_api.h) +directly in Objective-C codes. -This page doesn't include a discussion for about these languages, so you should -refer to the [iOS quickstart](ios.md) for a tutorial and example code. +See below for details about using Swift, Objective-C and C API, or follow the +[iOS quickstart](ios.md) for a tutorial and example code. ### Linux On Linux platforms (including [Raspberry Pi](build_rpi.md)), you can run -inferences using TensorFlow Lite APIs available in C++ and Python, as shown -in the following sections. +inferences using TensorFlow Lite APIs available in C++ and Python, as shown in +the following sections. +## Running a model -## Load and run a model in C++ +Running a TensorFlow Lite model involves a few simple steps: -Running a TensorFlow Lite model with C++ involves a few simple steps: - - 1. Load the model into memory as a `FlatBufferModel`. - 2. Build an `Interpreter` based on an existing `FlatBufferModel`. - 3. Set input tensor values. (Optionally resize input tensors if the - predefined sizes are not desired.) - 4. Invoke inference. - 5. Read output tensor values. - -The [`FlatBufferModel`]( -https://www.tensorflow.org/lite/api_docs/cc/class/tflite/flat-buffer-model.html) -class encapsulates a TensorFlow Lite model and you can -build it in a couple of different ways, depending on where the model is stored: - -```c++ -class FlatBufferModel { -  // Build a model based on a file. Return a nullptr in case of failure. -  static std::unique_ptr BuildFromFile( -      const char* filename, -      ErrorReporter* error_reporter); - -  // Build a model based on a pre-loaded flatbuffer. The caller retains -  // ownership of the buffer and should keep it alive until the returned object -  // is destroyed. Return a nullptr in case of failure. -  static std::unique_ptr BuildFromBuffer( -      const char* buffer, -      size_t buffer_size, -      ErrorReporter* error_reporter); -}; -``` - -Note: If TensorFlow Lite detects the presence of the [Android NNAPI]( -https://developer.android.com/ndk/guides/neuralnetworks), it will -automatically try to use shared memory to store the `FlatBufferModel`. - -Now that you have the model as a `FlatBufferModel` object, you can execute it -with an [`Interpreter`]( -https://www.tensorflow.org/lite/api_docs/cc/class/tflite/interpreter.html). -A single `FlatBufferModel` can be used -simultaneously by more than one `Interpreter`. - -Caution: The `FlatBufferModel` object must remain valid until -all instances of `Interpreter` using it have been destroyed. - -The important parts of the `Interpreter` API are shown in the -code snippet below. It should be noted that: - - * Tensors are represented by integers, in order to avoid string comparisons - (and any fixed dependency on string libraries). - * An interpreter must not be accessed from concurrent threads. - * Memory allocation for input and output tensors must be triggered - by calling `AllocateTensors()` right after resizing tensors. - -The simplest usage of TensorFlow Lite with C++ looks like this: - -```c++ -// Load the model -std::unique_ptr model = - tflite::FlatBufferModel::BuildFromFile(filename); - -// Build the interpreter -tflite::ops::builtin::BuiltinOpResolver resolver; -std::unique_ptr interpreter; -tflite::InterpreterBuilder(*model, resolver)(&interpreter); - -// Resize input tensors, if desired. -interpreter->AllocateTensors(); - -float* input = interpreter->typed_input_tensor(0); -// Fill `input`. 
- -interpreter->Invoke(); - -float* output = interpreter->typed_output_tensor(0); -``` - -For more example code, see [`minimal.cc`]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/minimal/minimal.cc) -and [`label_image.cc`]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/label_image/label_image.cc). +1. Load the model into memory. +2. Build an `Interpreter` based on an existing model. +3. Set input tensor values. (Optionally resize input tensors if the predefined + sizes are not desired.) +4. Invoke inference. +5. Read output tensor values. +Following sections describe how these steps can be done in each language. ## Load and run a model in Java +*Platform: Android* + The Java API for running an inference with TensorFlow Lite is primarily designed for use with Android, so it's available as an Android library dependency: `org.tensorflow:tensorflow-lite`. @@ -203,12 +136,12 @@ public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer); ``` In both cases, you must provide a valid TensorFlow Lite model or the API throws -`IllegalArgumentException`. If you use `MappedByteBuffer` to -initialize an `Interpreter`, it must remain unchanged for the whole lifetime -of the `Interpreter`. +`IllegalArgumentException`. If you use `MappedByteBuffer` to initialize an +`Interpreter`, it must remain unchanged for the whole lifetime of the +`Interpreter`. -To then run an inference with the model, simply call `Interpreter.run()`. -For example: +To then run an inference with the model, simply call `Interpreter.run()`. For +example: ```java try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) { @@ -228,9 +161,9 @@ In this case, each entry in `inputs` corresponds to an input tensor and output data. In both cases, the tensor indices should correspond to the values you gave to -the [TensorFlow Lite Converter](../convert/) when you created the model. -Be aware that the order of tensors in `input` must match the -order given to the TensorFlow Lite Converter. +the [TensorFlow Lite Converter](../convert/) when you created the model. Be +aware that the order of tensors in `input` must match the order given to the +TensorFlow Lite Converter. The `Interpreter` class also provides convenient functions for you to get the index of any model input or output using an operation name: @@ -250,8 +183,8 @@ resources must be released after use by: interpreter.close(); ``` -For an example project with Java, see the [Android image classification sample]( -https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android). +For an example project with Java, see the +[Android image classification sample](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android). ### Supported data types (in Java) @@ -295,13 +228,231 @@ have dynamic outputs, where the shape of output tensors can vary depending on the input. There's no straightforward way of handling this with the existing Java inference API, but planned extensions will make this possible. +## Load and run a model in Swift + +*Platform: iOS* + +The +[Swift API](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift) +is available in `TensorFlowLiteSwift` Pod from Cocoapods. + +First, you need to import `TensorFlowLite` module. + +```swift +import TensorFlowLite +``` + +```swift +// Getting model path +guard + let modelPath = Bundle.main.path(forResource: "model", ofType: "tflite") +else { + // Error handling... 
+} + +do { + // Initialize an interpreter with the model. + let interpreter = try Interpreter(modelPath: modelPath) + + // Allocate memory for the model's input `Tensor`s. + try interpreter.allocateTensors() + + let inputData: Data // Should be initialized + + // input data preparation... + + // Copy the input data to the input `Tensor`. + try self.interpreter.copy(inputData, toInputAt: 0) + + // Run inference by invoking the `Interpreter`. + try self.interpreter.invoke() + + // Get the output `Tensor` + let outputTensor = try self.interpreter.output(at: 0) + + // Copy output to `Data` to process the inference results. + let outputSize = outputTensor.shape.dimensions.reduce(1, {x, y in x * y}) + let outputData = + UnsafeMutableBufferPointer.allocate(capacity: outputSize) + outputTensor.data.copyBytes(to: outputData) + + if (error != nil) { /* Error handling... */ } +} catch error { + // Error handling... +} +``` + +## Load and run a model in Objective-C + +*Platform: iOS* + +The +[Objective-C API](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc) +is available in `TensorFlowLiteObjC` Pod from Cocoapods. + +First, you need to import `TensorFlowLite` module. + +```objc +@import TensorFlowLite; +``` + +```objc +NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"model" + ofType:@"tflite"]; +NSError *error; + +// Initialize an interpreter with the model. +TFLInterpreter *interpreter = [[TFLInterpreter alloc] initWithModelPath:modelPath + error:&error]; +if (error != nil) { /* Error handling... */ } + +// Allocate memory for the model's input `TFLTensor`s. +[interpreter allocateTensorsWithError:&error]; +if (error != nil) { /* Error handling... */ } + +NSMutableData *inputData; // Should be initialized +// input data preparation... + +// Copy the input data to the input `TFLTensor`. +[interpreter copyData:inputData toInputTensorAtIndex:0 error:&error]; +if (error != nil) { /* Error handling... */ } + +// Run inference by invoking the `TFLInterpreter`. +[interpreter invokeWithError:&error]; +if (error != nil) { /* Error handling... */ } + +// Get the output `TFLTensor` +TFLTensor *outputTensor = [interpreter outputTensorAtIndex:0 error:&error]; +if (error != nil) { /* Error handling... */ } + +// Copy output to `NSData` to process the inference results. +NSData *outputData = [outputTensor dataWithError:&error]; +if (error != nil) { /* Error handling... */ } +``` + +### Using C API in Objective-C code + +Currently Objective-C API does not support delegates. In order to use delegates +with Objective-C code, you need to directly call underlying +[C API](https://www.tensorflow.org/code/tensorflow/lite/c/c_api.h). + +```c +#include "tensorflow/lite/c/c_api.h" +``` + +```c +TfLiteModel* model = TfLiteModelCreateFromFile([modelPath UTF8String]); +TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + +// Create the interpreter. +TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options); + +// Allocate tensors and populate the input tensor data. +TfLiteInterpreterAllocateTensors(interpreter); +TfLiteTensor* input_tensor = + TfLiteInterpreterGetInputTensor(interpreter, 0); +TfLiteTensorCopyFromBuffer(input_tensor, input.data(), + input.size() * sizeof(float)); + +// Execute inference. +TfLiteInterpreterInvoke(interpreter); + +// Extract the output tensor data. 
+const TfLiteTensor* output_tensor = +// TfLiteInterpreterGetOutputTensor(interpreter, 0); +TfLiteTensorCopyToBuffer(output_tensor, output.data(), + output.size() * sizeof(float)); + +// Dispose of the model and interpreter objects. +TfLiteInterpreterDelete(interpreter); +TfLiteInterpreterOptionsDelete(options); +TfLiteModelDelete(model); +``` + +## Load and run a model in C++ + +*Platforms: Android and Linux* + +In C++, the model is stored in +[`FlatBufferModel`](https://www.tensorflow.org/lite/api_docs/cc/class/tflite/flat-buffer-model.html) +class. It encapsulates a TensorFlow Lite model and you can build it in a couple +of different ways, depending on where the model is stored: + +```c++ +class FlatBufferModel { +  // Build a model based on a file. Return a nullptr in case of failure. +  static std::unique_ptr BuildFromFile( +      const char* filename, +      ErrorReporter* error_reporter); + +  // Build a model based on a pre-loaded flatbuffer. The caller retains +  // ownership of the buffer and should keep it alive until the returned object +  // is destroyed. Return a nullptr in case of failure. +  static std::unique_ptr BuildFromBuffer( +      const char* buffer, +      size_t buffer_size, +      ErrorReporter* error_reporter); +}; +``` + +Note: If TensorFlow Lite detects the presence of the +[Android NNAPI](https://developer.android.com/ndk/guides/neuralnetworks), it +will automatically try to use shared memory to store the `FlatBufferModel`. + +Now that you have the model as a `FlatBufferModel` object, you can execute it +with an +[`Interpreter`](https://www.tensorflow.org/lite/api_docs/cc/class/tflite/interpreter.html). +A single `FlatBufferModel` can be used simultaneously by more than one +`Interpreter`. + +Caution: The `FlatBufferModel` object must remain valid until all instances of +`Interpreter` using it have been destroyed. + +The important parts of the `Interpreter` API are shown in the code snippet +below. It should be noted that: + +* Tensors are represented by integers, in order to avoid string comparisons + (and any fixed dependency on string libraries). +* An interpreter must not be accessed from concurrent threads. +* Memory allocation for input and output tensors must be triggered by calling + `AllocateTensors()` right after resizing tensors. + +The simplest usage of TensorFlow Lite with C++ looks like this: + +```c++ +// Load the model +std::unique_ptr model = + tflite::FlatBufferModel::BuildFromFile(filename); + +// Build the interpreter +tflite::ops::builtin::BuiltinOpResolver resolver; +std::unique_ptr interpreter; +tflite::InterpreterBuilder(*model, resolver)(&interpreter); + +// Resize input tensors, if desired. +interpreter->AllocateTensors(); + +float* input = interpreter->typed_input_tensor(0); +// Fill `input`. + +interpreter->Invoke(); + +float* output = interpreter->typed_output_tensor(0); +``` + +For more example code, see +[`minimal.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/minimal/minimal.cc) +and +[`label_image.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/label_image/label_image.cc). ## Load and run a model in Python -The Python API for running an inference is provided in the `tf.lite` -module. From which, you mostly need only [`tf.lite.Interpreter`]( -https://www.tensorflow.org/api_docs/python/tf/lite/Interpreter) to load -a model and run an inference. +*Platform: Linux* + +The Python API for running an inference is provided in the `tf.lite` module. 
+From which, you mostly need only +[`tf.lite.Interpreter`](https://www.tensorflow.org/api_docs/python/tf/lite/Interpreter) +to load a model and run an inference. The following example shows how to use the Python interpreter to load a `.tflite` file and run inference with random input data: @@ -358,13 +509,12 @@ interpreter.allocate_tensors() # Continue to get tensors and so forth, as shown above... ``` -For more Python sample code, see [`label_image.py`]( -https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/python/label_image.py). +For more Python sample code, see +[`label_image.py`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/python/label_image.py). Tip: Run `help(tf.lite.Interpreter)` in the Python terminal to get detailed documentation about the interpreter. - ## Write a custom operator All TensorFlow Lite operators (both custom and builtin) are defined using a @@ -379,10 +529,10 @@ typedef struct { } TfLiteRegistration; ``` -Refer to `context.h` for details on `TfLiteContext` and `TfLiteNode`. The -former provides error reporting facilities and access to global objects, -including all the tensors. The latter allows implementations to access their -inputs and outputs. +Refer to `context.h` for details on `TfLiteContext` and `TfLiteNode`. The former +provides error reporting facilities and access to global objects, including all +the tensors. The latter allows implementations to access their inputs and +outputs. When the interpreter loads a model, it calls `init()` once for each node in the graph. A given `init()` will be called more than once if the op is used multiple @@ -403,9 +553,9 @@ implementations can access their state using `node->user_data`. Finally, each time inference runs, the interpreter traverses the graph calling `invoke()`, and here too the state is available as `node->user_data`. -Custom ops can be implemented in exactly the same way as builtin ops, by -defined those four functions and a global registration function that usually -looks like this: +Custom ops can be implemented in exactly the same way as builtin ops, by defined +those four functions and a global registration function that usually looks like +this: ```c++ namespace tflite { @@ -461,8 +611,7 @@ You can optionally register custom ops (before you pass the resolver to the resolver.AddOp("MY_CUSTOM_OP", Register_MY_CUSTOM_OP()); ``` -If the set of builtin ops is deemed to be too large, a new `OpResolver` could -be code-generated based on a given subset of ops, possibly only the ones -contained in a given model. This is the equivalent of TensorFlow's selective -registration (and a simple version of it is available in the `tools` -directory). +If the set of builtin ops is deemed to be too large, a new `OpResolver` could be +code-generated based on a given subset of ops, possibly only the ones contained +in a given model. This is the equivalent of TensorFlow's selective registration +(and a simple version of it is available in the `tools` directory). From 397494a2313aa51fe0b87b4e51d3a2349e4f8ecc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 20:51:42 -0700 Subject: [PATCH 089/112] Hoisting unconditional converts from conditional branch computations. 
PiperOrigin-RevId: 317239618 Change-Id: If3b16ff4f2bbcf38ee1ca51f5e8b187c58ab8e91 --- .../xla/service/conditional_code_motion.cc | 285 ++++++++++++++++-- .../xla/service/conditional_code_motion.h | 15 +- .../service/conditional_code_motion_test.cc | 101 ++++++- 3 files changed, 369 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index eecdcc851e9..6db4c3eb6d4 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -106,7 +106,6 @@ class BranchVisitor { boundaries_.emplace_back(operand, i, inst); continue; } - worklist_.push_back(operand); visited_.insert(operand); } @@ -197,6 +196,7 @@ bool WorthHoisting(HloInstruction* instruction) { case HloOpcode::kMultiply: case HloOpcode::kDivide: case HloOpcode::kTuple: + case HloOpcode::kSqrt: case HloOpcode::kGetTupleElement: return true; default: @@ -206,10 +206,11 @@ bool WorthHoisting(HloInstruction* instruction) { // Compare if the instructions to be visited at each branches are identical. bool InstructionWithinBranchIdentical( - const std::vector& instructions, bool is_layout_senstive) { + const std::vector& instructions, + bool is_layout_sensitive) { // Identical includes the shape of each operands are equal. auto eq_operand = [&](const HloInstruction* a, const HloInstruction* b) { - bool eq_operands = is_layout_senstive + bool eq_operands = is_layout_sensitive ? ShapeUtil::Equal(a->shape(), b->shape()) : ShapeUtil::Compatible(a->shape(), b->shape()); return eq_operands; @@ -233,7 +234,7 @@ bool InstructionWithinBranchIdentical( auto old_channel_id = instruction->channel_id(); instruction->set_channel_id(instructions[0]->channel_id()); bool eq_instructions = instructions[0]->Identical( - *instruction, eq_operand, eq_computations, is_layout_senstive); + *instruction, eq_operand, eq_computations, is_layout_sensitive); instruction->set_channel_id(old_channel_id); return eq_instructions; }); @@ -243,7 +244,7 @@ bool InstructionWithinBranchIdentical( [&](HloInstruction* instruction) { return instructions[0]->Identical( *instruction, eq_operand, eq_computations, - is_layout_senstive); + is_layout_sensitive); }); } @@ -354,12 +355,228 @@ Status RemoveInstructionFromComputation( return Status::OK(); } +// Identify converts to be hoisted/rematerialized out of the branch +// computations. +absl::flat_hash_set FindSpecialConverts(HloInstruction* old_root, + int branch_count, + HloInstruction* conditional, + bool is_layout_sensitive) { + absl::flat_hash_set kspecial_convert; + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + if (old_root->operand(operand_num)->opcode() != HloOpcode::kConvert) { + continue; + } + bool replica = true; + HloInstruction* kspecial_convert_candidate = + old_root->mutable_operand(operand_num); + // Check whether an identical candidate appears in other branches + for (int others = 1; others < branch_count; ++others) { + HloInstruction* others_root = + conditional->branch_computation(others)->root_instruction(); + bool eq_shape = + is_layout_sensitive + ? ShapeUtil::Equal(others_root->operand(operand_num)->shape(), + kspecial_convert_candidate->shape()) + : ShapeUtil::Compatible( + others_root->operand(operand_num)->shape(), + kspecial_convert_candidate->shape()); + if ((others_root->operand(operand_num)->opcode() == + HloOpcode::kConvert) && + eq_shape) { + // Nothing to be done. 
+ } else { + replica = false; + break; + } + } + if (replica) { + kspecial_convert.insert(operand_num); + } + } + return kspecial_convert; +} + +// Restructuring the conditional instruction as follows: +// i.e., %result = conditional() becomes +// x = conditional() +// y.{0..n} = gte(x, {0..n}) +// z = tuple(y.0, y.1, ...y.n) +// Doing so ensures that we can accommodate the possible shape-change of the +// conditional when the instructions are hoisted. +Status RestructureConditionalInstruction(HloComputation* computation, + HloInstruction* conditional) { + HloInstruction* old_root = computation->root_instruction(); + std::vector new_operands; + int cur_index = 0; + for (; cur_index < ShapeUtil::TupleElementCount(conditional->shape()); + ++cur_index) { + new_operands.push_back( + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(conditional->shape(), cur_index), + conditional, cur_index))); + } + HloInstruction* new_tuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + if (old_root == conditional) { + computation->set_root_instruction(new_tuple); + } else { + std::vector new_tuple_users; + for (auto conditional_user : conditional->users()) { + auto is_new_gte = absl::c_find_if( + new_operands, + [&](HloInstruction* instr) { return instr == conditional_user; }); + if (is_new_gte == new_operands.end()) { + new_tuple_users.push_back(conditional_user); + } + } + for (auto new_tuple_user : new_tuple_users) { + TF_RETURN_IF_ERROR( + conditional->ReplaceUseWith(new_tuple_user, new_tuple)); + } + } + VLOG(2) << "computation after root restructure:\n" << computation->ToString(); + return Status::OK(); +} + +StatusOr ConvertSpecialMove(HloInstruction* conditional, + bool is_layout_sensitive) { + int branch_count = conditional->branch_count(); + if (branch_count <= 0) { + return false; + } + + HloInstruction* old_root = + conditional->branch_computation(0)->root_instruction(); + if (old_root->opcode() != HloOpcode::kTuple) { + return false; + } else { + VLOG(2) << "BEFORE :" << conditional->parent()->parent()->ToString(); + // Identify the gte using `index'. + auto find_gte = [](const HloInstruction* conditional_result, + int64 index) -> HloInstruction* { + for (HloInstruction* instr : conditional_result->users()) { + if (instr->opcode() != HloOpcode::kGetTupleElement) { + return nullptr; + } + if (instr->tuple_index() == index) { + return instr; + } + } + return nullptr; + }; + + // Captures tuple indices refering to converts to be rematerialized/hoisted. + absl::flat_hash_set kspecial_convert = FindSpecialConverts( + old_root, branch_count, conditional, is_layout_sensitive); + + // Exit if we cannot find any converts to be hoisted. 
+ if (kspecial_convert.empty()) { + return false; + } + + TF_RETURN_IF_ERROR( + RestructureConditionalInstruction(conditional->parent(), conditional)); + + for (int branch = 0; branch < branch_count; branch++) { + old_root = conditional->branch_computation(branch)->root_instruction(); + absl::flat_hash_map map_inst_to_tuple_index; + std::vector new_operands(old_root->operand_count()); + std::unordered_set to_hoist_set; + + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + map_inst_to_tuple_index[old_root->mutable_operand(operand_num)] = + operand_num; + } + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + HloInstruction* hoist = old_root->mutable_operand(operand_num); + if (!kspecial_convert.contains(operand_num)) { + new_operands[operand_num] = old_root->mutable_operand(operand_num); + continue; + } + + to_hoist_set.insert(hoist); + int64 new_tuple_count = old_root->operand_count(); + + // Replace the hoisted instr in the tuple with the operand/operands. + // We will replace at least one of the operands of the hoist at the + // tuple place; the rest will be added at the end. + bool inplace = true; + CHECK(!hoist->operands().empty()); + for (HloInstruction* prod : hoist->operands()) { + if (inplace) { + map_inst_to_tuple_index[prod] = map_inst_to_tuple_index[hoist]; + new_operands[map_inst_to_tuple_index[hoist]] = prod; + inplace = false; + } else { + map_inst_to_tuple_index[prod] = new_tuple_count++; + new_operands.push_back(prod); + } + } + } + + // Create the new root instruction. + HloComputation* cur_branch = conditional->branch_computation(branch); + HloInstruction* new_branch_root = + cur_branch->AddInstruction(HloInstruction::CreateTuple(new_operands)); + // The shape can vary since the operands to convert are now + // being returned through the branches' root. + cur_branch->set_root_instruction(new_branch_root, true /*new shape*/); + TF_CHECK_OK(cur_branch->RemoveInstruction(old_root)); + + // Only one of the branches needs to change the conditional->parent(). + if (branch != 0) { + continue; + } + HloComputation* conditional_parent = conditional->parent(); + HloInstruction* newconditional = + conditional_parent->AddInstruction(HloInstruction::CreateConditional( + cur_branch->root_instruction()->shape(), + conditional->mutable_operand(0), + absl::MakeSpan(conditional->branch_computations()), + absl::MakeSpan(conditional->operands()).subspan(1))); + // Ensure that all the users of conditional refer to the new one. + TF_RETURN_IF_ERROR( + conditional->ReplaceAllUsesWithDifferentShape(newconditional)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(conditional)); + conditional = newconditional; + // Add the hoisted instructions in the parent. + for (HloInstruction* hoist : to_hoist_set) { + VLOG(2) << "Hoisting instruction:" << hoist->ToString(); + int64 hoist_index = map_inst_to_tuple_index[hoist]; + // Find out the gte that captured the hoisted instr result. 
+ HloInstruction* gte_hoist = find_gte(conditional, hoist_index); + CHECK(gte_hoist != nullptr); + std::vector new_operands; + for (HloInstruction* op : hoist->operands()) { + HloInstruction* gte = conditional_parent->AddInstruction( + HloInstruction::CreateGetTupleElement( + op->shape(), conditional, map_inst_to_tuple_index[op])); + new_operands.push_back(gte); + } + HloInstruction* hoisted = conditional_parent->AddInstruction( + hoist->CloneWithNewOperands(hoist->shape(), new_operands)); + VLOG(2) << "Hoisted instruction in parent:" << hoisted->ToString(); + TF_RETURN_IF_ERROR(gte_hoist->ReplaceAllUsesWith(hoisted)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(gte_hoist)); + } + // No need to explicitly delete a hoisted instruction since if its dead + // then the subsequent DCE will remove it. + } + } + VLOG(2) << "AFTER :" << conditional->parent()->parent()->ToString(); + return true; +} + // Hoist identical ops out of the conditional. The definition of identical // are the shape of the operands are identical and their properties are // identical. Will start from the root instruction of each branch and get // the identical ops to hoist. StatusOr MergeIdenticalElements(HloInstruction* conditional, bool is_layout_sensitive) { + VLOG(1) << " visiting conditional:" << conditional->ToString(); int branch_count = conditional->branch_count(); if (branch_count <= 0) { return false; @@ -399,7 +616,7 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, } } - if (visitors[0].HoistInstructionSize() <= 1) { + if (visitors[0].HoistInstructionSize() < 1) { return false; } @@ -442,7 +659,6 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, RemoveInstructionFromComputation(visitors[i].instructions_to_hoist(), conditional->branch_computation(i))); } - return true; } @@ -451,26 +667,55 @@ StatusOr MergeIdenticalElements(HloInstruction* conditional, StatusOr ConditionalCodeMotion::Run(HloModule* module) { bool changed = false; - // Gather all the conditional ops in our module. We do this ahead of time so - // we don't have to worry about mutating the lists of computations or - // instructions as we iterate. 
- std::vector conditional_ops; - for (auto* comp : module->MakeComputationPostOrder()) { - for (auto* instr : comp->MakeInstructionPostOrder()) { - if (instr->opcode() == HloOpcode::kConditional) { - conditional_ops.push_back(instr); + if (pursue_full_conditional_code_motion_) { + std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } } } + + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN( + bool result, + MergeIdenticalElements(conditional_op, is_layout_sensitive_)); + changed |= result; + } + + if (changed) { + HloPassPipeline subpipeline("after_conditional_code_motion"); + subpipeline.AddPass(); + subpipeline.AddPass(); + subpipeline.AddPass(); + TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); + changed |= cleanup_changed; + } } - for (HloInstruction* conditional_op : conditional_ops) { - TF_ASSIGN_OR_RETURN(bool result, MergeIdenticalElements( - conditional_op, is_layout_sensitive_)); - changed |= result; + // handling convert rematerialization/hoisting + { + std::vector conditional_ops; + for (auto* comp : module->MakeComputationPostOrder()) { + for (auto* instr : comp->MakeInstructionPostOrder()) { + if (instr->opcode() == HloOpcode::kConditional) { + conditional_ops.push_back(instr); + } + } + } + for (HloInstruction* conditional_op : conditional_ops) { + TF_ASSIGN_OR_RETURN( + bool convert_result, + ConvertSpecialMove(conditional_op, is_layout_sensitive_)); + changed |= convert_result; + } } if (changed) { - HloPassPipeline subpipeline("after_conditional_code_motion"); + HloPassPipeline subpipeline( + "after_conditional_code_motion_after_convert_hoisting"); + subpipeline.AddPass(); subpipeline.AddPass(); subpipeline.AddPass(); TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.h b/tensorflow/compiler/xla/service/conditional_code_motion.h index 1197a8b3620..95f02833e15 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.h +++ b/tensorflow/compiler/xla/service/conditional_code_motion.h @@ -23,7 +23,11 @@ limitations under the License. namespace xla { -// HLO pass that moves identical ops out of conditional. +// ConditionalCodeMotion specializes in hoisting/rematerializing +// unconditional converts in the default mode. +// When pursue_full_conditional_code_motion_ is set to true, the +// full HLO pass moves identical ops out of a conditional in addition to moving +// converts. // - The definition of identical are the shape of the operands are identical // and their properties are identical. // - Currently, only some types of instructions is supported. @@ -35,13 +39,18 @@ class ConditionalCodeMotion : public HloModulePass { public: // If is_layout_sensitive is true, then the hoist process preserves layout // during identical comparison. Otherwise, layout is ignored. 
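Given the two construction flags documented above, here is a minimal fragment showing how the pass is driven; it mirrors the updated tests later in this patch and assumes `module` is an already-parsed `std::unique_ptr<HloModule>`.

```c++
// Illustrative fragment only. (true, false) runs just the default convert
// hoisting; (true, true) additionally moves identical ops out of the branches.
ConditionalCodeMotion pass(/*is_layout_sensitive=*/true,
                           /*pursue_full_conditional_code_motion=*/true);
bool changed = pass.Run(module.get()).ValueOrDie();
```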
- explicit ConditionalCodeMotion(bool is_layout_sensitive = true) - : is_layout_sensitive_(is_layout_sensitive) {} + explicit ConditionalCodeMotion( + bool is_layout_sensitive = true, + bool pursue_full_conditional_code_motion = false) + : is_layout_sensitive_(is_layout_sensitive), + pursue_full_conditional_code_motion_( + pursue_full_conditional_code_motion) {} absl::string_view name() const override { return "conditional-code-motion"; } StatusOr Run(HloModule* module) override; private: const bool is_layout_sensitive_; + const bool pursue_full_conditional_code_motion_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index 4a52303a42a..38b2b515fa0 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -38,7 +38,86 @@ namespace { using ConditionalCodeMotionTest = HloTestBase; namespace op = xla::testing::opcode_matchers; -TEST_F(ConditionalCodeMotionTest, DoNotMoveConvertOut) { +TEST_F(ConditionalCodeMotionTest, MoveSubsetTupleOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(%convert.2894, %reshape.8493) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(%convert.3604, %add) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + get-first-index.2 = f32[2,512,364]{2,1,0} get-tuple-element(conditional), index=1 + ROOT result = (bf16[2,512,364]{2,1,0}, f32[2,512,364]{2,1,0}) tuple(get-first-index, get-first-index.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(), op::GetTupleElement()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveConvertOutConditionalRoot) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %add.8493 = 
f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.8493, f32[2,512,364]{2,1,0} %reshape.8493) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %add.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.3) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %sub.8493 = f32[2,512,364]{2,1,0} subtract(f32[2,512,364]{2,1,0} %add.8493, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + ROOT conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveConvertOut) { absl::string_view hlo_string = R"( HloModule RemoveDotOpOut @@ -65,12 +144,16 @@ ENTRY main { arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 - ROOT result = (bf16[2,512,364]{2,1,0}) tuple(get-first-index) + add.1 = bf16[2,512,364]{2,1,0} add(bf16[2,512,364]{2,1,0} get-first-index, bf16[2,512,364]{2,1,0} get-first-index) + ROOT result = (bf16[2,512,364]{2,1,0}) tuple(add.1) } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; - ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Add(op::Convert(), op::Convert())))); } TEST_F(ConditionalCodeMotionTest, UserShareOperandCannotBeMoved) { @@ -123,7 +206,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = @@ -181,7 +264,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); @@ -245,7 +328,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = @@ -317,7 +400,7 @@ ENTRY main { )"; auto module = 
ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); } @@ -390,7 +473,7 @@ ENTRY main { } )"; auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); - ConditionalCodeMotion pass; + ConditionalCodeMotion pass(true, true); ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); From 0b0eef4031fa2674a2c5d32aa7570a82c3def6a8 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Thu, 18 Jun 2020 20:59:28 -0700 Subject: [PATCH 090/112] Move enabling xnnpack delegate to AllocateTensors to allow other delegates to be applied first. PiperOrigin-RevId: 317240424 Change-Id: I89b616f891f65f7cff6beedbf5c2a372f7456592 --- tensorflow/lite/interpreter.cc | 15 +++++++++++++-- tensorflow/lite/interpreter.h | 15 +++++++++++---- tensorflow/lite/interpreter_builder.cc | 17 ++++++----------- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index cae2ca7dde0..b49aa5031bf 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -86,8 +86,9 @@ TfLiteQuantization GetQuantizationFromLegacy( } // namespace Interpreter::Interpreter(ErrorReporter* error_reporter) - : error_reporter_(error_reporter ? error_reporter - : DefaultErrorReporter()) { + : error_reporter_(error_reporter ? error_reporter : DefaultErrorReporter()), + lazy_delegate_provider_( + TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {})) { // TODO(b/128420794): Include the TFLite runtime version in the log. // Prod logging is useful for mobile platforms where scraping console logs is // critical for debugging. @@ -175,6 +176,16 @@ TfLiteStatus Interpreter::SetVariables(std::vector variables) { } TfLiteStatus Interpreter::AllocateTensors() { + // Apply the default delegate that TFLite will enable at this point to allow + // other user-level delegates to be applied first. + if (lazy_delegate_provider_) { + // The execution will fall back to default implementation if the XNNPACK + // delegate fails to be applied. Therefore, we ignore the return status + // here and let it fall through the rest of the code. + ModifyGraphWithDelegate(std::move(lazy_delegate_provider_)); + lazy_delegate_provider_.reset(); + } + return primary_subgraph().AllocateTensors(); } diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index 59cab6add6d..41377c4ce1f 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -347,10 +347,12 @@ class Interpreter { /// WARNING: Experimental interface, subject to change TfLiteStatus ReleaseNonPersistentMemory(); - /// Update allocations for all tensors. This will redim dependent tensors - /// using the input tensor dimensionality as given. This is relatively - /// expensive. If you know that your sizes are not changing, you need not call - /// this. Returns status of success or failure. + // Update allocations for all tensors. This will redim dependent tensors + // using the input tensor dimensionality as given. This is relatively + // expensive. This *must be* called after the interpreter has been created + // and before running inference (and accessing tensor buffers), and *must be* + // called again if (and only if) an input tensor is resized. Returns status of + // success or failure. TfLiteStatus AllocateTensors(); /// Invoke the interpreter (run the whole graph in dependency order). 
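The revised comment above tightens the AllocateTensors() contract, and the interpreter.cc hunk makes it the point where the lazily provided default delegate is applied. The C++ sketch below shows the resulting call order; it reuses the API already shown in the inference guide earlier in this series, and the delegate line is hypothetical (assumes a `my_delegate` you created yourself).

```c++
// Sketch of the call order implied by the new contract; error checking omitted.
std::unique_ptr<tflite::FlatBufferModel> model =
    tflite::FlatBufferModel::BuildFromFile("model.tflite");
tflite::ops::builtin::BuiltinOpResolver resolver;
std::unique_ptr<tflite::Interpreter> interpreter;
tflite::InterpreterBuilder(*model, resolver)(&interpreter);

// Apply any user delegate first (hypothetical `my_delegate`)...
// interpreter->ModifyGraphWithDelegate(std::move(my_delegate));

// ...then AllocateTensors(), which now also applies the default delegate
// (e.g. XNNPACK) that InterpreterBuilder set aside, silently falling back to
// the default implementation if that delegate cannot be applied.
interpreter->AllocateTensors();

float* input = interpreter->typed_input_tensor<float>(0);
// Fill `input`, then run inference and read the output.
interpreter->Invoke();
float* output = interpreter->typed_output_tensor<float>(0);
```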
@@ -594,6 +596,11 @@ class Interpreter { // A map of resources. Owned by interpreter and shared by multiple subgraphs. resource::ResourceMap resources_; + + // Indicating a delegate that the TFLite interpreter will apply by default. + // A nullptr value means there's no delegate to be applied by default or the + // delegate has been applied and doesn't need to be applied again. + TfLiteDelegatePtr lazy_delegate_provider_; }; } // namespace impl diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index d73b298e595..4b491d41881 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -545,17 +545,7 @@ TfLiteStatus InterpreterBuilder::ParseTensors( TfLiteStatus InterpreterBuilder::ApplyDelegates(Interpreter* interpreter, int num_threads) { - // First, apply XNNPACK delegate if applicable. - if (num_fp32_tensors_ > 0) { - // The execution will fall back to default implementation if the XNNPACK - // delegate fails to be applied. Therefore, we ignore the return status - // here and let it fall through the rest of the code. - if (auto xnnpack_delegate = MaybeCreateXNNPACKDelegate(num_threads)) { - interpreter->ModifyGraphWithDelegate(std::move(xnnpack_delegate)); - } - } - - // Secondly, apply Flex delegate if applicable. + // Apply Flex delegate if applicable. if (has_flex_op_) { if (auto flex_delegate = AcquireFlexDelegate()) { return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate)); @@ -672,6 +662,11 @@ TfLiteStatus InterpreterBuilder::operator()( modified_subgraph->SetVariables(std::move(variables)); } + if (num_fp32_tensors_ > 0) { + (*interpreter)->lazy_delegate_provider_ = + MaybeCreateXNNPACKDelegate(num_threads); + } + if (ApplyDelegates(interpreter->get(), num_threads) != kTfLiteOk) return cleanup_and_error(); From b5bb616121f1805c5ff5391daf00c86b6bcad1ae Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 21:06:35 -0700 Subject: [PATCH 091/112] *** Reason for rollback *** CL316577397 prevents H2D prefetch overlapping for TF2 MLPerf Resnet model on V100x8 More details in b/159372996 With the rollback, the prefetching overlapping is back, and the training speed also recovered. *** Original change description *** Add DT_BOOL support to GPU variable ops This is a follow-on to PR #38848 & PR #39172 and resolves remaining ask in Issue #35994. The original PR tried to add many variable ops on the GPU including DT_BOOL. However, this caused testCondModifyBoolPred to fail and thus the DT_BOOL type was removed. The reason for the test failure is once DT_BOOL variables are supported on the GPU, we need to ensure the switch ops are also updated to not have ho... 
*** PiperOrigin-RevId: 317241338 Change-Id: Id7b7d79622e0537ccb677f081b487014ac4d2395 --- tensorflow/core/kernels/control_flow_ops.cc | 10 +++++----- tensorflow/core/kernels/variable_ops.cc | 3 ++- .../debug/lib/debug_graph_reconstruction_test.py | 6 +++--- tensorflow/python/ops/control_flow_ops_test.py | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc index 435de3c5954..c8e83b6f672 100644 --- a/tensorflow/core/kernels/control_flow_ops.cc +++ b/tensorflow/core/kernels/control_flow_ops.cc @@ -111,17 +111,15 @@ REGISTER_GPU_SWITCH(uint64); TF_CALL_variant(REGISTER_GPU_SWITCH); TF_CALL_uint32(REGISTER_GPU_SWITCH); TF_CALL_uint32(REGISTER_GPU_REF_SWITCH); -TF_CALL_bool(REGISTER_GPU_SWITCH); -TF_CALL_bool(REGISTER_GPU_REF_SWITCH); #undef REGISTER_CPU_SWITCH #undef REGISTER_CPU_REF_SWITCH #undef REGISTER_GPU_SWITCH #undef REGISTER_GPU_REF_SWITCH -// Special GPU kernels for int32, string & resource handles. Requiring all -// inputs and outputs to be in host memory. -// TODO(b/25387198): Also enable int32 in device memory. +// Special GPU kernels for int32 and string. +// TODO(b/25387198): Also enable int32 in device memory. This kernel +// registration requires all int32 inputs and outputs to be in host memory. #define REGISTER_GPU_HOST_KERNEL(type) \ REGISTER_KERNEL_BUILDER(Name("Switch") \ .Device(DEVICE_GPU) \ @@ -151,6 +149,8 @@ TF_CALL_bool(REGISTER_GPU_REF_SWITCH); REGISTER_GPU_HOST_KERNEL(int32); REGISTER_GPU_HOST_REF_KERNEL(int32); +REGISTER_GPU_HOST_KERNEL(bool); +REGISTER_GPU_HOST_REF_KERNEL(bool); REGISTER_GPU_HOST_KERNEL(tstring); REGISTER_GPU_HOST_REF_KERNEL(tstring); REGISTER_GPU_HOST_KERNEL(ResourceHandle); diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc index ccd33e8c75a..6f5e0b94eca 100644 --- a/tensorflow/core/kernels/variable_ops.cc +++ b/tensorflow/core/kernels/variable_ops.cc @@ -252,7 +252,8 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL); TF_CALL_int64(REGISTER_GPU_KERNELS); TF_CALL_uint32(REGISTER_GPU_KERNELS); -TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py index b3baa6e7bc2..fb722efab4e 100644 --- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py +++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py @@ -73,9 +73,9 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase): for attr_key in new_node.attr: if attr_key == "parallel_iterations": new_node.attr[attr_key].i = 1 - elif new_node.op == "Switch" or new_node.op == "Identity": - # We don't check the inputs to Switch or Identity ops as their inputs - # may be Send/Recv nodes. + elif new_node.op == "Switch": + # We don't check the inputs to Switch ops as their inputs may be + # Send/Recv nodes. 
del new_node.input[:] return output_graph_def diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py index 3ca9bda82f2..9254695d988 100644 --- a/tensorflow/python/ops/control_flow_ops_test.py +++ b/tensorflow/python/ops/control_flow_ops_test.py @@ -396,10 +396,10 @@ class CondTest(test_util.TensorFlowTestCase): fn2=lambda: math_ops.add(y, 23)) self.assertEquals(self.evaluate(z), 24) - @test_util.run_v1_only("Exercises Ref variables") + @test_util.run_deprecated_v1 def testCondModifyBoolPred(self): - # We want to use the GPU here because we want to ensure that we can update - # a boolean ref variable on the GPU. + # This test in particular used to fail only when running in GPU, hence + # use_gpu=True. with test_util.use_gpu(): bool_var = variable_scope.get_variable( "bool_var", dtype=dtypes.bool, initializer=True) From cfbdd27fe3f2b904609e1551490a01640ae4fcac Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 18 Jun 2020 21:17:54 -0700 Subject: [PATCH 092/112] Update ops-related pbtxt files. PiperOrigin-RevId: 317242544 Change-Id: I36000fdb2d595b5006ea111105ece5ca6f537732 --- .../core/ops/compat/ops_history_v2/Acos.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Asin.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Atan.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Inv.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Neg.pbtxt | 29 +++++++++++++++++++ .../compat/ops_history_v2/Reciprocal.pbtxt | 29 +++++++++++++++++++ .../ops/compat/ops_history_v2/Round.pbtxt | 29 +++++++++++++++++++ .../ops/compat/ops_history_v2/Square.pbtxt | 29 +++++++++++++++++++ .../core/ops/compat/ops_history_v2/Tan.pbtxt | 29 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 18 ++++++++++++ 10 files changed, 279 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt index 3ed45186f6e..417dbfc7e7d 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Acos.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Acos" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt index 7df768f7c66..c799ff99169 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Asin.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Asin" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt index 86f0628ab53..4a80c7a751e 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Atan.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + 
name: "Atan" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt index ca208664617..0c191790030 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Inv.pbtxt @@ -168,3 +168,32 @@ op { } } } +op { + name: "Inv" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt index 77bb4a5872d..864d0257fe4 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Neg.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Neg" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt index 5ea1abe4c9c..7e03554871a 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Reciprocal.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Reciprocal" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt index 4f59b21afd5..c5685dc6143 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Round.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Round" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt index 4d07faf4fd0..6af75b3ddc1 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Square.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Square" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: 
"T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt index 7dc7f84fd38..80e0b1e22c4 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/Tan.pbtxt @@ -78,3 +78,32 @@ op { } } } +op { + name: "Tan" + input_arg { + name: "x" + type_attr: "T" + } + output_arg { + name: "y" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + type: DT_COMPLEX64 + type: DT_COMPLEX128 + } + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 1f1cf7444fb..dbd91c91b65 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -216,6 +216,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -2333,6 +2335,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -2646,6 +2650,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -19442,6 +19448,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -25498,6 +25506,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -35191,6 +35201,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -40686,6 +40698,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -48071,6 +48085,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 @@ -50832,6 +50848,8 @@ op { type: DT_HALF type: DT_FLOAT type: DT_DOUBLE + type: DT_INT8 + type: DT_INT16 type: DT_INT32 type: DT_INT64 type: DT_COMPLEX64 From b7caba2c42285a8e1cb875bec1664d5b0e6c65e9 Mon Sep 17 00:00:00 2001 From: Ashwin Murthy Date: Thu, 18 Jun 2020 22:10:41 -0700 Subject: [PATCH 093/112] Update RNN conversion tflite g3doc This uses the content from the blog post/dogfood announcement email PiperOrigin-RevId: 317248288 Change-Id: I210c64bd54c70aa5b68742d59d6d36fa154e856c --- tensorflow/lite/g3doc/convert/rnn.md | 240 +++++++++++++++++++-------- 1 file changed, 167 insertions(+), 73 deletions(-) diff --git a/tensorflow/lite/g3doc/convert/rnn.md b/tensorflow/lite/g3doc/convert/rnn.md index 52bc287c151..734992c0904 100644 --- a/tensorflow/lite/g3doc/convert/rnn.md +++ b/tensorflow/lite/g3doc/convert/rnn.md @@ -1,99 +1,193 @@ -# Convert RNN models +# TensorFlow RNN conversion to TensorFlow Lite -The TensorFlow Lite interpreter currently implements a subset of TensorFlow -operations, meaning some model architectures cannot immediately be converted due -to missing operations. +## Overview -Some RNN-based architectures are affected by this. 
The following document -outlines the current state of play and provides strategies for converting RNN -models. +TensorFlow Lite supports converting TensorFlow RNN models to TensorFlow Lite’s +fused LSTM operators. Fused operators exist to maximize the performance of their +underlying kernel implementations, as well as provide a higher level interface +to define complex transformations like quantizatization. -## Currently supported +Since there are many variants of RNN APIs in TensorFlow, our approach has been +two fold: -Currently, RNN models using -[`tf.compat.v1.nn.static_rnn`](https://www.tensorflow.org/api_docs/python/tf/nn/static_rnn) -can be converted successfully as long as no `sequence_length` is specified. +1. Provide **native support for standard TensorFlow RNN APIs** like Keras LSTM. + This is the recommended option. +1. Provide an **interface** **into the conversion infrastructure for** + **user-defined** **RNN implementations** to plug in and get converted to + TensorFlow Lite. We provide a couple of out of box examples of such + conversion using lingvo’s + [LSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L123) + and + [LayerNormalizedLSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L519) + RNN interfaces. -The following `tf.compat.v1.nn.rnn_cell` operations work with -`tf.compat.v1.nn.static_rnn`: +## Converter API -* [tf.compat.v1.nn.rnn_cell.LSTMCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/LSTMCell) -* [tf.compat.v1.nn.rnn_cell.RNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/RNNCell) -* [tf.compat.v1.nn.rnn_cell.GRUCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/GRUCell) -* [tf.compat.v1.nn.rnn_cell.BasicLSTMCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/BasicLSTMCell) -* [tf.compat.v1.nn.rnn_cell.BasicRNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/BasicRNNCell) +Currently this feature is available through the +[tf-nightly](https://pypi.org/project/tf-nightly/) pip or from head. This will +be available in the TensorFlow 2.3 release. -In addition, TensorFlow Lite provides some experimental drop-in replacements for -RNN operations that enable dynamic RNN architectures with TensorFlow Lite. +This conversion functionality is available when converting to TensorFlow Lite +via a SavedModel or from the Keras model directly. See example usages. -Drop-in replacements are available for the following: +### From saved model -* [tf.compat.v1.nn.dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) -* [tf.compat.v1.nn.bidirectional_dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) -* [tf.compat.v1.nn.rnn_cell.RNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/RNNCell) -* [tf.compat.v1.nn.rnn_cell.LSTMCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/LSTMCell) +``` +# build a saved model. Here concrete_function is the exported function +# corresponding to the TensorFlow model containing one or more +# Keras LSTM layers. +saved_model, saved_model_dir = build_saved_model_lstm(...) +saved_model.save(saved_model_dir, save_format="tf", signatures=concrete_func) -## Not currently supported +# Convert the model. 
+converter = TFLiteConverter.from_saved_model(saved_model_dir) +tflite_model = converter.convert() +``` -TensorFlow Lite does not currently support -[Control Flow](https://www.tensorflow.org/api_docs/cc/group/control-flow-ops) -operations. This means that, unless one of the conversion strategies discussed -in the next section are employed, models built with the following TensorFlow -functions will not convert successfully: +### From Keras model -* [tf.compat.v1.nn.static_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/static_rnn) - where a `sequence_length` is specified -* [tf.compat.v1.nn.dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) -* [tf.compat.v1.nn.bidirectional_dynamic_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) +``` +# build a Keras model +keras_model = build_keras_lstm(...) -Note: TensorFlow Lite plans to implement all required Control Flow operations by -the end of 2019. At this point, all RNN architectures will convert successfully. +# Convert the model. +converter = TFLiteConverter.from_keras_model(keras_model) +tflite_model = converter.convert() -## Conversion strategies +``` -To convert an RNN model that uses the functions specified above, you will have -to modify its architecture and retrain it. The following strategies can be used. +## Example -### 1. Refactoring +Keras LSTM to TensorFlow Lite +[Colab](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/experimental_new_converter/Keras_LSTM_fusion_Codelab.ipynb) +illustrates the end to end usage with the TensorFlow Lite interpreter. -The simplest approach, if possible, is to refactor the model architecture to use -[tf.compat.v1.nn.static_rnn](https://www.tensorflow.org/api_docs/python/tf/nn/static_rnn) -without `sequence_length`. +## TensorFlow RNNs APIs supported -### 2. Drop-in replacements that use op hints and fused ops +### Keras LSTM conversion (recommended) -TensorFlow Lite provides the some experimental drop-in replacements for RNN -operations that enable dynamic RNN architectures with TensorFlow Lite. Using -[OpHints](https://www.tensorflow.org/lite/guide/ops_custom#converting_tensorflow_models_to_convert_graphs), -they run normally during training, but are substituted with special fused ops -when run by the Lite interpreter. +We support out-of-the-box conversion of Keras LSTM to TensorFlow Lite. For +details on how this works please refer to the +[Keras LSTM interface](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/experimental_new_converter/Keras_LSTM_fusion_Codelab.ipynb) +and to the conversion logic +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L627). 
-The following drop-in replacements are available: +Also important is to highlight the TensorFlow Lite’s LSTM contract with respect +to the Keras operation definition: -* [tf.compat.v1.lite.experimental.nn.dynamic_rnn](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn.py#L41) - * replacement for tf.nn.dynamic_rnn -* [tf.compat.v1.lite.experimental.nn.bidirectional_dynamic_rnn](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn.py#L279) - * replacement for tf.nn.bidirectional_dynamic_rnn -* [tf.compat.v1.lite.experimental.nn.TfLiteRNNCell](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn_cell.py#L39) - * replacement for tf.nn.rnn_cell.RNNCell -* [tf.compat.v1.lite.experimental.nn.TfLiteLSTMCell](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/rnn_cell.py#L159) - * replacement for tf.nn.rnn_cell.LSTMCell +1. The dimension 0 of the input tensor is the batch size. +1. The dimension 0 of the recurrent\_weight tensor is the number of outputs. +1. The **weight** and **recurrent\_kernel** tensors are transposed. +1. The transposed weight, transposed recurrent\_kernel and bias tensors are + split into 4 equal sized tensors along the dimension 0. These correspond to + **input gate, forget gate, cell, and output gate**. -Note: These replacements must be used together. For example, if you are using -`tf.compat.v1.lite.experimental.nn.dynamic_rnn`, you must combine it with -`tf.compat.v1.lite.experimental.nn.TfLiteRNNCell` instead of using -`tf.compat.v1.nn.rnn_cell.RNNCell`. +#### Keras LSTM Variants -Instead of -[tf.compat.v1.nn.rnn_cell.MultiRNNCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/MultiRNNCell), -you should use -[tf.compat.v1.keras.layers.StackedRNNCells](https://www.tensorflow.org/api_docs/python/tf/keras/layers/StackedRNNCells). +##### Time major -For a tutorial on using these replacements, see -[TensorFlow Lite LSTM ops API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/g3doc/README.md). +Users may choose time-major or no time-major. Keras LSTM adds a time-major +attribute in the function def attributes. For Unidirectional sequence LSTM, we +can simply map to unidirecional\_sequence\_lstm's +[time major attribute](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/ir/tfl_ops.td#L3508). -For a Colab demonstrating these classes, refer to -[TensorFlowLite_LSTM_Keras_Tutorial](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/examples/lstm/TensorFlowLite_LSTM_Keras_Tutorial.ipynb). +##### BiDirectional LSTM -Note: There is no replacement available for -[tf.compat.v1.nn.rnn_cell.GRUCell](https://www.tensorflow.org/api_docs/python/tf/nn/rnn_cell/GRUCell). +Bidirectional LSTM can be implemented with two Keras LSTM layers, one for +forward and one for backward, see examples +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/keras/layers/wrappers.py#L381). +Once we see the go\_backward attribute, we recognize it as backward LSTM, then +we group forward & backward LSTM together. **This is future work.** Currently, +this creates two UnidirectionalSequenceLSTM operators in the TensorFlow Lite +model. + +### User-defined LSTM conversion examples + +TensorFlow Lite also provides a way to convert user defined LSTM +implementations. 
Here we use Lingvo’s LSTM as an example of how that can be +implemented. For details please refer to the +[lingvo.LSTMCellSimple interface](https://github.com/tensorflow/lingvo/blob/master/lingvo/core/rnn_cell.py#L230) +and the conversion logic +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L123). +We also provide an example for another of Lingvo’s LSTM definitions in +[lingvo.LayerNormalizedLSTMCellSimple interface](https://github.com/tensorflow/lingvo/blob/master/lingvo/core/rnn_cell.py#L1179) +and its convertion logic +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L130). + +## “Bring your own TensorFlow RNN” to TensorFlow Lite + +If a user's RNN interface is different from the standard supported ones, there +are a couple of options: + +**Option 1:** Write adapter code in TensorFlow python to adapt the RNN interface +to the Keras RNN interface. This means a tf.function with +[tf\_implements annotation](https://github.com/tensorflow/community/pull/113) on +the generated RNN interface’s function that is identical to the one generated by +the Keras LSTM layer. After this, the same conversion API used for Keras LSTM +will work. + +**Option 2:** If the above is not possible (e.g. the Keras LSTM is missing some +functionality that is currently exposed by TensorFlow Lite’s fused LSTM op like +layer normalization), then extend the TensorFlow Lite converter by writing +custom conversion code and plug it into the prepare-composite-functions +MLIR-pass +[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L108). +The function’s interface should be treated like an API contract and should +contain the arguments needed to convert to fused TensorFlow Lite LSTM +operators - i.e. input, bias, weights, projection, layer normalization, etc. It +is preferable for the tensors passed as arguments to this function to have known +rank (i.e. RankedTensorType in MLIR). This makes it much easier to write +conversion code that can assume these tensors as RankedTensorType and helps +transform them to ranked tensors corresponding to the fused TensorFlow Lite +operator’s operands. + +A complete example of such conversion flow is Lingvo’s LSTMCellSimple to +TensorFlow Lite conversion. + +The LSTMCellSimple in Lingvo is defined +[here](https://github.com/tensorflow/lingvo/blob/master/lingvo/core/rnn_cell.py#L230). +Models trained with this LSTM cell can be converted to TensorFlow Lite as +follows: + +1. Wrap all uses of LSTMCellSimple in a tf.function with a tf\_implements + annotation that is labelled as such (e.g. lingvo.LSTMCellSimple would be a + good annotation name here). Make sure the tf.function that is generated + matches the interface of the function expected in the conversion code. This + is a contract between the model author adding the annotation and the + conversion code. +1. Extend the prepare-composite-functions pass to plug in a custom composite op + to TensorFlow Lite fused LSTM op conversion. See + [LSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc#L123) + conversion code. + + The conversion contract: + +1. **Weight** and **projection** tensors are transposed. + +1. 
The **{input, recurrent}** to **{cell, input gate, forget gate, output + gate}** are extracted by slicing the transposed weight tensor. + +1. The **{bias}** to **{cell, input gate, forget gate, output gate}** are + extracted by slicing the bias tensor. + +1. The **projection** is extracted by slicing the transposed projection tensor. + +1. Similar conversion is written for + [LayerNormalizedLSTMCellSimple](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc#L519). + +1. The rest of the TensorFlow Lite conversion infrastructure, including all the + [MLIR passes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc#L58) + defined as well as the final export to TensorFlow Lite flatbuffer can be + reused. + +## Known issues/limitations + +1. Currently there is support only for converting stateless Keras LSTM (default + behavior in Keras). Stateful Keras LSTM conversion is future work. +1. It is still possible to model a stateful Keras LSTM layer using the + underlying stateless Keras LSTM layer and managing the state explicitly in + the user program. Such a TensorFlow program can still be converted to + TensorFlow Lite using the feature being described here. +1. Bidirectional LSTM is currently modelled as two UnidirectionalSequenceLSTM + operators in TensorFlow Lite. This will be replaced with a single + BidirectionalSequenceLSTM op. From 158d4be42d7aea11a395d2f79483ac93289e1bb8 Mon Sep 17 00:00:00 2001 From: Xinyi Wang Date: Thu, 18 Jun 2020 22:15:44 -0700 Subject: [PATCH 094/112] Add get_next_as_optional method for a distributed iterator The function is called on a distributed iterator and returns an `Optional` that contains the next value, the PerReplica input, from Distributed iterator or no value if this `iterator` has reached the end of the sequence. PiperOrigin-RevId: 317248910 Change-Id: Ide217da1aff1d62f8d0d8f43423be2d859d933d3 --- .../custom_training_loop_input_test.py | 49 ++++++++++- .../python/distribute/distribute_lib.py | 4 + tensorflow/python/distribute/input_lib.py | 60 +++++++++++++ .../python/distribute/input_lib_test.py | 86 +++++++++++++------ ...low.distribute.-distributed-iterator.pbtxt | 4 + 5 files changed, 177 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/distribute/custom_training_loop_input_test.py b/tensorflow/python/distribute/custom_training_loop_input_test.py index e4f782810dd..5660b5839ce 100644 --- a/tensorflow/python/distribute/custom_training_loop_input_test.py +++ b/tensorflow/python/distribute/custom_training_loop_input_test.py @@ -30,6 +30,7 @@ from tensorflow.python.distribute import strategy_combinations from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -136,8 +137,52 @@ class InputIterationTest(test.TestCase, parameterized.TestCase, @combinations.generate( combinations.combine( - distribution=strategy_combinations.tpu_strategies, - mode=["eager"])) + distribution=strategy_combinations.all_strategies, mode=["eager"])) + def testGetNextAsOptional(self, distribution): + data = [5., 6., 7., 8.] 
+ dataset = get_dataset_from_tensor_slices(data).batch(2) + dist_dataset = distribution.experimental_distribute_dataset(dataset) + iterator = iter(dist_dataset) + + def train_step(data): + return math_ops.square(data) + + @def_function.function + def run(iterator): + return distribution.experimental_local_results( + distribution.run( + train_step, args=(iterator.get_next_as_optional().get_value(),))) + + self.assert_equal_flattened([[25., 36.]], [run(iterator)]) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, mode=["eager"])) + def testGetNextAsOptionalExampleUsage(self, distribution): + global_batch_size = 2 + steps_per_loop = 6 + dataset = dataset_ops.Dataset.range( + 8, output_type=dtypes.int32).batch(global_batch_size) + distributed_iterator = iter( + distribution.experimental_distribute_dataset(dataset)) + + @def_function.function + def train_fn(distributed_iterator): + + def step_fn(x): + return x + + for _ in math_ops.range(steps_per_loop): + optional_data = distributed_iterator.get_next_as_optional() + if not optional_data.has_value(): + break + distribution.run(step_fn, args=(optional_data.get_value(),)) + + train_fn(distributed_iterator) + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.tpu_strategies, mode=["eager"])) def testFullEagerTPU(self, distribution): dataset = get_dataset_from_tensor_slices([5., 6., 7., 8.]).batch(2) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index b6a89463426..ec0b911ebe0 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -200,6 +200,7 @@ import six from tensorflow.python.autograph.core import ag_ctx as autograph_ctx from tensorflow.python.autograph.impl import api as autograph from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import iterator_ops from tensorflow.python.distribute import collective_util from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribution_strategy_context @@ -2879,6 +2880,9 @@ class _DefaultDistributionExtended(StrategyExtendedV1): def get_next(self): return self._iterator.get_next() + def get_next_as_optional(self): + return iterator_ops.get_next_as_optional(self._iterator) + @deprecated(None, "Use the iterator's `initializer` property instead.") def initialize(self): """Initialize underlying iterators. diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index ff468af7f87..e4a362a92c6 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -29,6 +29,7 @@ from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import distribute from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import multi_device_iterator_ops +from tensorflow.python.data.ops import optional_ops from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribute_utils from tensorflow.python.distribute import distribution_strategy_context @@ -235,6 +236,40 @@ class DistributedIteratorInterface(collections.Iterator, raise NotImplementedError( "DistributedIterator.element_spec() must be implemented in descendants") + def get_next_as_optional(self): + """Returns a `tf.experimental.Optional` that contains the next value for all replicas. 
+ + If the `tf.distribute.DistributedIterator` has reached the end of the + sequence, the returned `tf.experimental.Optional` will have no value. + + Example usage: + + >>> strategy = tf.distribute.MirroredStrategy() + >>> global_batch_size = 2 + >>> steps_per_loop = 2 + >>> dataset = tf.data.Dataset.range(10).batch(global_batch_size) + >>> distributed_iterator = iter( + ... strategy.experimental_distribute_dataset(dataset)) + >>> def step_fn(x): + ... return x + >>> @tf.function + ... def train_fn(distributed_iterator): + ... for _ in tf.range(steps_per_loop): + ... optional_data = distributed_iterator.get_next_as_optional() + ... if not optional_data.has_value(): + ... break + ... tf.print(strategy.run(step_fn, args=(optional_data.get_value(),))) + >>> train_fn(distributed_iterator) + ... # ([0 1],) + ... # ([2 3],) + + Returns: + An `tf.experimental.Optional` object representing the next value from the + `tf.distribute.DistributedIterator` (if it has one) or no value. + """ + raise NotImplementedError( + "get_next_as_optional() not implemented in descendants") + @tf_export("distribute.DistributedDataset", v1=[]) class DistributedDatasetInterface(collections.Iterable, @@ -622,6 +657,31 @@ class DistributedIteratorBase(DistributedIteratorInterface): def __iter__(self): return self + def get_next_as_optional(self): + global_has_value, replicas = _get_next_as_optional(self, self._strategy) + + def return_none(): + return optional_ops.Optional.empty(self._element_spec) + + def return_value(replicas): + """Wraps the inputs for replicas in an `tf.experimental.Optional`.""" + results = [] + for i, worker in enumerate(self._input_workers.worker_devices): + with ops.device(worker): + devices = self._input_workers.compute_devices_for_worker(i) + for j, device in enumerate(devices): + with ops.device(device): + result = replicas[i][j] + results.append(result) + replicas = results + + return optional_ops.Optional.from_value( + distribute_utils.regroup(replicas)) + + return control_flow_ops.cond(global_has_value, + lambda: return_value(replicas), + lambda: return_none()) # pylint: disable=unnecessary-lambda + def get_next(self, name=None): """Returns the next input from the iterator for all replicas.""" if not self._enable_get_next_as_optional: diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index ff4436c4c8c..7f02d0121d0 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -185,38 +185,76 @@ class DistributedIteratorTestBase(test.TestCase): if not ops.executing_eagerly_outside_functions(): evaluate(control_flow_ops.group(iterator.initializer)) - for expected_value in expected_values: - next_element = iterator.get_next() - computed_value = evaluate( - [distribute_utils.select_replica(r, next_element) - for r in range(len(devices))]) - self.assertEqual(len(expected_value), len(computed_value)) - for i in range(len(expected_value)): - self.assertAllEqual(expected_value[i], computed_value[i]) + def test_get_next(iterator): + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate([ + distribute_utils.select_replica(r, next_element) + for r in range(len(devices)) + ]) - with self.assertRaises(errors.OutOfRangeError): - next_element = iterator.get_next() - evaluate( - [distribute_utils.select_replica(r, next_element) - for r in range(len(devices))]) + self.assertEqual(len(expected_value), len(computed_value)) + for i in 
range(len(expected_value)): + self.assertAllEqual(expected_value[i], computed_value[i]) - # After re-initializing the iterator, should be able to iterate again. - if not ops.executing_eagerly_outside_functions(): - evaluate(control_flow_ops.group(iterator.initializer)) + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + evaluate([ + distribute_utils.select_replica(r, next_element) + for r in range(len(devices)) + ]) + + # After re-initializing the iterator, should be able to iterate again. + if not ops.executing_eagerly_outside_functions(): + evaluate(control_flow_ops.group(iterator.initializer)) + else: + if api_type == "wrap_into_iterator": + self.skipTest("unsupported test combination") + else: + iterator = iter(dataset) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = evaluate([ + distribute_utils.select_replica(r, next_element) + for r in range(len(devices)) + ]) + self.assertEqual(len(expected_value), len(computed_value)) + for i in range(len(expected_value)): + self.assertAllEqual(expected_value[i], computed_value[i]) + + def test_get_next_as_optional(iterator): + for expected_value in expected_values: + next_element = iterator.get_next_as_optional() + computed_value = evaluate([ + distribute_utils.select_replica(r, next_element.get_value()) + for r in range(len(devices)) + ]) + + self.assertEqual(len(expected_value), len(computed_value)) + for i in range(len(expected_value)): + self.assertAllEqual(expected_value[i], computed_value[i]) + + next_element = iterator.get_next_as_optional() + self.assertFalse(self.evaluate(next_element.has_value())) + with self.assertRaises(errors.InvalidArgumentError): + evaluate([ + distribute_utils.select_replica(r, next_element.get_value()) + for r in range(len(devices)) + ]) + + test_get_next(iterator) + + # re-initializing the iterator + if not tf2.enabled(): + self.skipTest("Not testing get_next_as_optional in TF1") else: if api_type == "wrap_into_iterator": self.skipTest("unsupported test combination") else: iterator = iter(dataset) - for expected_value in expected_values: - next_element = iterator.get_next() - computed_value = evaluate( - [distribute_utils.select_replica(r, next_element) - for r in range(len(devices))]) - self.assertEqual(len(expected_value), len(computed_value)) - for i in range(len(expected_value)): - self.assertAllEqual(expected_value[i], computed_value[i]) + test_get_next_as_optional(iterator) if iteration_type == "for_loop" and context.executing_eagerly(): actual_values = [] diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt index f712d9058b9..47899cc4188 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-distributed-iterator.pbtxt @@ -13,4 +13,8 @@ tf_class { name: "get_next" argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "get_next_as_optional" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } } From 539e9cb3a22793aad5d2df885e016f43b81a6a9f Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Thu, 18 Jun 2020 22:33:44 -0700 Subject: [PATCH 095/112] Update quantization docs to use TFLiteConverter.from_saved_model() API instead of .from_keras_model() API PiperOrigin-RevId: 317251205 Change-Id: Ia8166decfa76327e3fd44871b194ffcae0f049f8 --- 
.../lite/g3doc/performance/post_training_quantization.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index ac584dd4c1c..dcf251e6d3d 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -34,7 +34,7 @@ weights from floating point to integer, which has 8-bits of precision:
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 tflite_quant_model = converter.convert()
 
@@ -68,7 +68,7 @@ the following steps:
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
@@ -96,7 +96,7 @@ the following steps:
 
 
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
@@ -120,7 +120,7 @@ quantization of weights, use the following steps:
 
 
 import tensorflow as tf
-converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
 converter.target_spec.supported_types = [tf.float16]
 tflite_quant_model = converter.convert()

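
The updated snippets above assume a `saved_model_dir` on disk rather than an
in-memory Keras model. A minimal sketch of how such a directory might be
produced (illustrative only, not part of the patch; the model and the path are
placeholders):

```
import tensorflow as tf

# Export a trained Keras model as a SavedModel so that
# TFLiteConverter.from_saved_model(saved_model_dir) can load it.
model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(5,))])
saved_model_dir = "/tmp/my_saved_model"  # hypothetical path
tf.saved_model.save(model, saved_model_dir)

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quant_model = converter.convert()
```
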
From 8e654afea4adba36b94b0f7a3d33a23e788612e0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Thu, 18 Jun 2020 23:35:25 -0700
Subject: [PATCH 096/112] tf.numpy: Improve ndarray.__getitem__ to match numpy
 semantics.
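
A rough sketch of the indexing cases this enables, assuming the public
tf.experimental.numpy endpoint that wraps these ops (illustrative only, not
part of this change):

```
import tensorflow.experimental.numpy as tnp  # assumed public endpoint

x = tnp.reshape(tnp.arange(12), (3, 4))
print(x[1, 2:])           # basic slicing, supported before this change
print(x[..., -1])         # ellipsis and negative indices
print(x[x > 5])           # boolean-mask indexing
print(x[[2, 0], [1, 3]])  # advanced integer indexing (new gather-based path)
```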

PiperOrigin-RevId: 317256717
Change-Id: Ie89b81689f96242e3e9b01568e13937b80aaffc7
---
 .../python/ops/numpy_ops/np_array_ops.py      | 261 ++++++++++++++++--
 tensorflow/python/ops/numpy_ops/np_arrays.py  | 137 ---------
 2 files changed, 245 insertions(+), 153 deletions(-)

diff --git a/tensorflow/python/ops/numpy_ops/np_array_ops.py b/tensorflow/python/ops/numpy_ops/np_array_ops.py
index 906e53c556d..47236d45561 100644
--- a/tensorflow/python/ops/numpy_ops/np_array_ops.py
+++ b/tensorflow/python/ops/numpy_ops/np_array_ops.py
@@ -20,12 +20,15 @@ from __future__ import division
 from __future__ import print_function
 
 import math
+import numbers
+from typing import Sequence
 import numpy as np
 import six
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -164,9 +167,11 @@ def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):  #
 @np_utils.np_doc_only(np.array)
 def array(val, dtype=None, copy=True, ndmin=0):  # pylint: disable=redefined-outer-name
   """Since Tensors are immutable, a copy is made only if val is placed on a
+
   different device than the current one. Even if `copy` is False, a new Tensor
   may need to be built to satisfy `dtype` and `ndim`. This is used only if `val`
-  is an ndarray or a Tensor."""  # pylint:disable=g-docstring-missing-newline
+  is an ndarray or a Tensor.
+  """  # pylint:disable=g-docstring-missing-newline
   if dtype:
     dtype = np_utils.result_type(dtype)
   if isinstance(val, np_arrays.ndarray):
@@ -215,6 +220,8 @@ def array(val, dtype=None, copy=True, ndmin=0):  # pylint: disable=redefined-out
   result_t = np_utils.cond(
       np_utils.greater(ndmin, ndims), true_fn, lambda: result_t)
   return np_arrays.tensor_to_ndarray(result_t)
+
+
 # pylint: enable=g-short-docstring-punctuation,g-no-space-after-docstring-summary,g-doc-return-or-yield,g-doc-args
 
 
@@ -1446,14 +1453,13 @@ def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
   # broadcast.
   arr_shape_original = array_ops.shape(arr)
   indices_shape_original = array_ops.shape(indices)
-  arr_shape = array_ops.tensor_scatter_update(
-      arr_shape_original, [[axis]], [1])
-  indices_shape = array_ops.tensor_scatter_update(
-      indices_shape_original, [[axis]], [1])
-  broadcasted_shape = array_ops.broadcast_dynamic_shape(
-      arr_shape, indices_shape)
-  arr_shape = array_ops.tensor_scatter_update(
-      broadcasted_shape, [[axis]], [arr_shape_original[axis]])
+  arr_shape = array_ops.tensor_scatter_update(arr_shape_original, [[axis]], [1])
+  indices_shape = array_ops.tensor_scatter_update(indices_shape_original,
+                                                  [[axis]], [1])
+  broadcasted_shape = array_ops.broadcast_dynamic_shape(arr_shape,
+                                                        indices_shape)
+  arr_shape = array_ops.tensor_scatter_update(broadcasted_shape, [[axis]],
+                                              [arr_shape_original[axis]])
   indices_shape = array_ops.tensor_scatter_update(
       broadcasted_shape, [[axis]], [indices_shape_original[axis]])
   arr = array_ops.broadcast_to(arr, arr_shape)
@@ -1468,10 +1474,10 @@ def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
   swapaxes_ = lambda t: swapaxes(np_utils.tensor_to_ndarray(t), axis, -1).data
 
   dont_move_axis_to_end = math_ops.equal(axis, rank - 1)
-  arr = np_utils.cond(
-      dont_move_axis_to_end, lambda: arr, lambda: swapaxes_(arr))
-  indices = np_utils.cond(
-      dont_move_axis_to_end, lambda: indices, lambda: swapaxes_(indices))
+  arr = np_utils.cond(dont_move_axis_to_end, lambda: arr,
+                      lambda: swapaxes_(arr))
+  indices = np_utils.cond(dont_move_axis_to_end, lambda: indices,
+                          lambda: swapaxes_(indices))
 
   arr_shape = array_ops.shape(arr)
   arr = array_ops.reshape(arr, [-1, arr_shape[-1]])
@@ -1481,8 +1487,231 @@ def take_along_axis(arr, indices, axis):  # pylint: disable=missing-docstring
 
   result = array_ops.gather(arr, indices, batch_dims=1)
   result = array_ops.reshape(result, indices_shape)
-  result = np_utils.cond(
-      dont_move_axis_to_end, lambda: result, lambda: swapaxes_(result))
+  result = np_utils.cond(dont_move_axis_to_end, lambda: result,
+                         lambda: swapaxes_(result))
   result.set_shape(possible_result_shape)
 
-  return  np_utils.tensor_to_ndarray(result)
+  return np_utils.tensor_to_ndarray(result)
+
+
+_SLICE_ERORR = (
+    'only integers, slices (`:`), ellipsis (`...`), '
+    'numpy.newaxis (`None`) and integer or boolean arrays are valid indices')
+
+
+def _as_index(idx, need_scalar=True):
+  """Helper function to parse idx as an index.
+
+  Args:
+    idx: index
+    need_scalar: If idx needs to be a scalar value.
+
+  Returns:
+    A pair, (indx, bool). First one is the parsed index and can be a tensor,
+    or scalar integer / Dimension. Second one is True if rank is known to be 0.
+
+  Raises:
+    IndexError: For incorrect indices.
+  """
+  if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)):
+    return idx, True
+  data = asarray(idx).data
+  if data.dtype == dtypes.bool:
+    if data.shape.ndims != 1:
+      # TODO(agarwal): handle higher rank boolean masks.
+      raise NotImplementedError('Need rank 1 for bool index %s' % idx)
+    data = array_ops.where_v2(data)
+    data = array_ops.reshape(data, [-1])
+  if need_scalar and data.shape.rank not in (None, 0):
+    raise IndexError(_SLICE_ERORR + ', got {!r}'.format(idx))
+  np_dtype = data.dtype.as_numpy_dtype
+  if not np.issubdtype(np_dtype, np.integer):
+    raise IndexError(_SLICE_ERORR + ', got {!r}'.format(idx))
+  if data.dtype not in (dtypes.int64, dtypes.int32):
+    # TF slicing can only handle int32/int64. So we need to cast.
+    promoted_dtype = np.promote_types(np.int32, np_dtype)
+    if promoted_dtype == np.int32:
+      data = math_ops.cast(data, dtypes.int32)
+    elif promoted_dtype == np.int64:
+      data = math_ops.cast(data, dtypes.int64)
+    else:
+      raise IndexError(_SLICE_ERORR + ', got {!r}'.format(idx))
+  return data, data.shape.rank == 0
+
+
+def _slice_helper(tensor, slice_spec):
+  """Helper function for __getitem__."""
+  begin, end, strides = [], [], []
+  new_axis_mask, shrink_axis_mask = 0, 0
+  begin_mask, end_mask = 0, 0
+  ellipsis_mask = 0
+  advanced_indices = []
+  shrink_indices = []
+  for index, s in enumerate(slice_spec):
+    if isinstance(s, slice):
+      if s.start is not None:
+        begin.append(_as_index(s.start)[0])
+      else:
+        begin.append(0)
+        begin_mask |= (1 << index)
+      if s.stop is not None:
+        end.append(_as_index(s.stop)[0])
+      else:
+        end.append(0)
+        end_mask |= (1 << index)
+      if s.step is not None:
+        strides.append(_as_index(s.step)[0])
+      else:
+        strides.append(1)
+    elif s is Ellipsis:
+      begin.append(0)
+      end.append(0)
+      strides.append(1)
+      ellipsis_mask |= (1 << index)
+    elif s is array_ops.newaxis:
+      begin.append(0)
+      end.append(0)
+      strides.append(1)
+      new_axis_mask |= (1 << index)
+    else:
+      s, is_scalar = _as_index(s, False)
+      if is_scalar:
+        begin.append(s)
+        end.append(s + 1)
+        strides.append(1)
+        shrink_axis_mask |= (1 << index)
+        shrink_indices.append(index)
+      else:
+        begin.append(0)
+        end.append(0)
+        strides.append(1)
+        begin_mask |= (1 << index)
+        end_mask |= (1 << index)
+        advanced_indices.append((index, s, ellipsis_mask != 0))
+
+  # stack possibly involves no tensors, so we must use op_scope correct graph.
+  with ops.name_scope(
+      None,
+      'strided_slice', [tensor] + begin + end + strides,
+      skip_on_eager=False) as name:
+    if begin:
+      packed_begin, packed_end, packed_strides = (array_ops.stack(begin),
+                                                  array_ops.stack(end),
+                                                  array_ops.stack(strides))
+      if (packed_begin.dtype == dtypes.int64 or
+          packed_end.dtype == dtypes.int64 or
+          packed_strides.dtype == dtypes.int64):
+        if packed_begin.dtype != dtypes.int64:
+          packed_begin = math_ops.cast(packed_begin, dtypes.int64)
+        if packed_end.dtype != dtypes.int64:
+          packed_end = math_ops.cast(packed_end, dtypes.int64)
+        if packed_strides.dtype != dtypes.int64:
+          packed_strides = math_ops.cast(packed_strides, dtypes.int64)
+    else:
+      var_empty = constant_op.constant([], dtype=dtypes.int32)
+      packed_begin = packed_end = packed_strides = var_empty
+    # TODO(agarwal): set_shape on tensor to set rank.
+    tensor = array_ops.strided_slice(
+        tensor,
+        packed_begin,
+        packed_end,
+        packed_strides,
+        begin_mask=begin_mask,
+        end_mask=end_mask,
+        shrink_axis_mask=shrink_axis_mask,
+        new_axis_mask=new_axis_mask,
+        ellipsis_mask=ellipsis_mask,
+        name=name)
+    if not advanced_indices:
+      return tensor
+    advanced_indices_map = {}
+    for index, data, had_ellipsis in advanced_indices:
+      if had_ellipsis:
+        num_shrink = len([x for x in shrink_indices if x > index])
+        dim = index - len(slice_spec) + num_shrink
+      else:
+        num_shrink = len([x for x in shrink_indices if x < index])
+        dim = index - num_shrink
+      advanced_indices_map[dim] = data
+    dims = sorted(advanced_indices_map.keys())
+    dims_contiguous = True
+    if len(dims) > 1:
+      if dims[0] < 0 and dims[-1] >= 0:  # not all same sign
+        dims_contiguous = False
+      else:
+        for i in range(len(dims) - 1):
+          if dims[i] + 1 != dims[i + 1]:
+            dims_contiguous = False
+            break
+    indices = [advanced_indices_map[x] for x in dims]
+    indices = [x.data for x in _promote_dtype(*indices)]
+    indices = np_utils.tf_broadcast(*indices)
+    stacked_indices = array_ops.stack(indices, axis=-1)
+    if not dims_contiguous:
+      tensor = moveaxis(tensor, dims, range(len(dims))).data
+      tensor_shape_prefix = array_ops.shape(
+          tensor, out_type=stacked_indices.dtype)[:len(dims)]
+      stacked_indices = array_ops.where_v2(
+          stacked_indices < 0, stacked_indices + tensor_shape_prefix,
+          stacked_indices)
+      return array_ops.gather_nd(tensor, stacked_indices)
+    # Note that gather_nd does not support gathering from inside the array.
+    # To avoid shuffling data back and forth, we transform the indices and
+    # do a gather instead.
+    rank = np_utils._maybe_static(array_ops.rank(tensor))  # pylint: disable=protected-access
+    dims = [(x + rank if x < 0 else x) for x in dims]
+    shape_tensor = array_ops.shape(tensor, out_type=stacked_indices.dtype)
+    dim_sizes = array_ops.gather(shape_tensor, dims)
+    if len(dims) == 1:
+      stacked_indices = indices[0]
+    stacked_indices = array_ops.where_v2(stacked_indices < 0,
+                                         stacked_indices + dim_sizes,
+                                         stacked_indices)
+    axis = dims[0]
+    if len(dims) > 1:
+      index_scaling = math_ops.cumprod(
+          dim_sizes, reverse=True, exclusive=True)
+      stacked_indices = math_ops.tensordot(
+          stacked_indices, index_scaling, axes=1)
+      flat_shape = array_ops.concat(
+          [shape_tensor[:axis], [-1], shape_tensor[axis + len(dims):]],
+          axis=0)
+      tensor = array_ops.reshape(tensor, flat_shape)
+
+    return array_ops.gather(tensor, stacked_indices, axis=axis)
+
+
+def _as_spec_tuple(slice_spec):
+  """Convert slice_spec to tuple."""
+  if isinstance(slice_spec,
+                Sequence) and not isinstance(slice_spec, np.ndarray):
+    is_index = True
+    for s in slice_spec:
+      if s is None or s is Ellipsis or isinstance(s, (Sequence, slice)):
+        is_index = False
+        break
+      elif isinstance(s, (np_arrays.ndarray, np.ndarray)) and s.ndim != 0:
+        is_index = False
+        break
+    if not is_index:
+      return tuple(slice_spec)
+  return (slice_spec,)
+
+
+def _getitem(self, slice_spec):
+  """Implementation of ndarray.__getitem__."""
+  if (isinstance(slice_spec, bool) or (isinstance(slice_spec, ops.Tensor) and
+                                       slice_spec.dtype == dtypes.bool) or
+      (isinstance(slice_spec, (np.ndarray, np_arrays.ndarray)) and
+       slice_spec.dtype == np.bool)):
+    return np_utils.tensor_to_ndarray(
+        array_ops.boolean_mask(tensor=self.data, mask=slice_spec))
+
+  if not isinstance(slice_spec, tuple):
+    slice_spec = _as_spec_tuple(slice_spec)
+
+  result_t = _slice_helper(self.data, slice_spec)
+  return np_utils.tensor_to_ndarray(result_t)
+
+
+setattr(np_arrays.ndarray, '__getitem__', _getitem)
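
As a hypothetical sanity check (not part of the diff), the non-contiguous
advanced-index case handled above via moveaxis + gather_nd is expected to
match NumPy's placement of the index dimensions:

```
import numpy as np
import tensorflow.experimental.numpy as tnp  # assumed public endpoint

x_np = np.arange(60).reshape(3, 4, 5)
x_tf = tnp.reshape(tnp.arange(60), (3, 4, 5))

# Advanced indices on axes 0 and 2 are separated by a slice, so NumPy moves
# the broadcast index dimension to the front: the result shape is (2, 4).
print(x_np[[0, 2], :, [1, 3]].shape)  # (2, 4)
print(x_tf[[0, 2], :, [1, 3]].shape)  # expected to match NumPy: (2, 4)
```
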
diff --git a/tensorflow/python/ops/numpy_ops/np_arrays.py b/tensorflow/python/ops/numpy_ops/np_arrays.py
index 8bec8a469a2..88bf4e7499a 100644
--- a/tensorflow/python/ops/numpy_ops/np_arrays.py
+++ b/tensorflow/python/ops/numpy_ops/np_arrays.py
@@ -20,138 +20,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numbers
 import numpy as np
 import six
 
 from tensorflow.python.framework import composite_tensor
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import type_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.numpy_ops import np_dtypes
-from tensorflow.python.util import nest
-
-
-_SLICE_TYPE_ERROR = (
-    'Only integers, slices (`:`), ellipsis (`...`), '
-    'tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid '
-    'indices')
-
-_SUPPORTED_SLICE_DTYPES = (dtypes.int32, dtypes.int32_ref, dtypes.int64,
-                           dtypes.int64_ref)
-
-
-def _check_index(idx):
-  """Check if a given value is a valid index into a tensor."""
-  if isinstance(idx, (numbers.Integral, tensor_shape.Dimension)):
-    return
-
-  # Optimistic check. Assumptions:
-  # * any object with a dtype is supported
-  # * any object with a dtype has a sizeable shape attribute.
-  dtype = getattr(idx, 'dtype', None)
-  if (dtype is None or dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or
-      idx.shape and len(idx.shape) == 1):
-    # TODO(slebedev): IndexError seems more appropriate here, but it
-    # will break `_slice_helper` contract.
-    raise TypeError(_SLICE_TYPE_ERROR + ', got {!r}'.format(idx))
-
-
-def _is_undefined_dimension(d):
-  return isinstance(d, tensor_shape.Dimension) and d.value is None
-
-
-def _slice_helper(tensor, slice_spec, var=None):
-  """Copied from array_ops._slice_helper, will be merged back later."""
-  if isinstance(slice_spec, bool) or \
-  (isinstance(slice_spec, ops.Tensor) and slice_spec.dtype == dtypes.bool) or \
-  (isinstance(slice_spec, np.ndarray) and slice_spec.dtype == bool):
-    return array_ops.boolean_mask(tensor=tensor, mask=slice_spec)
-
-  if not isinstance(slice_spec, (list, tuple)):
-    slice_spec = [slice_spec]
-
-  begin, end, strides = [], [], []
-  index = 0
-
-  new_axis_mask, shrink_axis_mask = 0, 0
-  begin_mask, end_mask = 0, 0
-  ellipsis_mask = 0
-  for s in slice_spec:
-    if isinstance(s, slice):
-      if s.start is not None and not _is_undefined_dimension(s.start):
-        _check_index(s.start)
-        begin.append(s.start)
-      else:
-        begin.append(0)
-        begin_mask |= (1 << index)
-      if s.stop is not None and not _is_undefined_dimension(s.stop):
-        _check_index(s.stop)
-        end.append(s.stop)
-      else:
-        end.append(0)
-        end_mask |= (1 << index)
-      if s.step is not None and not _is_undefined_dimension(s.step):
-        _check_index(s.step)
-        strides.append(s.step)
-      else:
-        strides.append(1)
-    elif s is Ellipsis:
-      begin.append(0)
-      end.append(0)
-      strides.append(1)
-      ellipsis_mask |= (1 << index)
-    elif s is array_ops.newaxis:
-      begin.append(0)
-      end.append(0)
-      strides.append(1)
-      new_axis_mask |= (1 << index)
-    else:
-      _check_index(s)
-      begin.append(s)
-      end.append(s + 1)
-      strides.append(1)
-      shrink_axis_mask |= (1 << index)
-    index += 1
-
-  # stack possibly involves no tensors, so we must use op_scope correct graph.
-  with ops.name_scope(
-      None,
-      'strided_slice', [tensor] + begin + end + strides,
-      skip_on_eager=False) as name:
-    if begin:
-      packed_begin, packed_end, packed_strides = (array_ops.stack(begin),
-                                                  array_ops.stack(end),
-                                                  array_ops.stack(strides))
-      if (packed_begin.dtype == dtypes.int64 or
-          packed_end.dtype == dtypes.int64 or
-          packed_strides.dtype == dtypes.int64):
-        if packed_begin.dtype != dtypes.int64:
-          packed_begin = math_ops.cast(packed_begin, dtypes.int64)
-        if packed_end.dtype != dtypes.int64:
-          packed_end = math_ops.cast(packed_end, dtypes.int64)
-        if packed_strides.dtype != dtypes.int64:
-          packed_strides = math_ops.cast(packed_strides, dtypes.int64)
-    else:
-      var_empty = constant_op.constant([], dtype=dtypes.int32)
-      packed_begin = packed_end = packed_strides = var_empty
-    return array_ops.strided_slice(
-        tensor,
-        packed_begin,
-        packed_end,
-        packed_strides,
-        begin_mask=begin_mask,
-        end_mask=end_mask,
-        shrink_axis_mask=shrink_axis_mask,
-        new_axis_mask=new_axis_mask,
-        ellipsis_mask=ellipsis_mask,
-        var=var,
-        name=name)
 
 
 def convert_to_tensor(value, dtype=None, dtype_hint=None):
@@ -361,22 +240,6 @@ class ndarray(composite_tensor.CompositeTensor):  # pylint: disable=invalid-name
   def __bool__(self):
     return self.__nonzero__()
 
-  def __getitem__(self, slice_spec):
-    # TODO(srbs): Need to support better indexing.
-    def _gettensor(x):
-      if isinstance(x, ndarray):
-        x = x.data
-      if isinstance(x, ops.Tensor) and x.dtype not in (
-          dtypes.int32, dtypes.int64):
-        # Currently _slice_helper will only work with int32/int64 tensors, but
-        # type inference by numpy can create {u,}int{8,16}, so just cast.
-        x = math_ops.cast(x, dtypes.int32)
-      return x
-    slice_spec = nest.map_structure(_gettensor, slice_spec)
-
-    result_t = _slice_helper(self.data, slice_spec)
-    return tensor_to_ndarray(result_t)
-
   def __iter__(self):
     if not isinstance(self.data, ops.EagerTensor):
       raise TypeError('Iteration over symbolic tensor is not allowed')

From e972c5572634efd188696038e9241b75cdcd69bc Mon Sep 17 00:00:00 2001
From: Gaurav Jain 
Date: Fri, 19 Jun 2020 00:07:20 -0700
Subject: [PATCH 097/112] Add uint32 & uint64 to TF_CALL_INTEGRAL_TYPES

Both uint32 & uint64 had been omitted from TF_CALL_INTEGRAL_TYPES due to
concerns about binary size bloat. In reality the size increase is only
around 2MB. Further, this fixes #39649, since we are no longer
inadvertently using the XLA_CPU device to perform tf.reduce_mean.
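
For context, TF_CALL_INTEGRAL_TYPES follows the X-macro pattern: each kernel
file passes its registration macro to the type-list macro, so adding a type to
the list stamps out a registration for it everywhere the list is used. A
minimal standalone sketch of the pattern (the Fill helper and the exact type
names are illustrative, not TensorFlow's actual code):

    #include <cstdint>
    #include <iostream>

    // Type-list macro: invokes the given macro once per integral type.
    #define CALL_INTEGRAL_TYPES(m)                  \
      m(uint64_t) m(int64_t) m(uint32_t) m(int32_t) \
      m(uint16_t) m(int16_t) m(uint8_t) m(int8_t)

    // "Registration" macro: defines one Fill overload per listed type.
    #define DEFINE_FILL(T)                           \
      void Fill(T* data, int n, T value) {           \
        for (int i = 0; i < n; ++i) data[i] = value; \
      }

    CALL_INTEGRAL_TYPES(DEFINE_FILL)

    int main() {
      uint32_t buf[4];
      // Available only because uint32_t is in the type list above.
      Fill(buf, 4, static_cast<uint32_t>(7));
      std::cout << buf[0] << "\n";  // prints 7
      return 0;
    }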

PiperOrigin-RevId: 317259372
Change-Id: Iacf75eaedce198fbef4bd9fd59b6fefa584cbf34
---
 tensorflow/core/framework/register_types.h    | 21 +++++---------
 tensorflow/core/framework/types.cc            |  5 ----
 tensorflow/core/kernels/BUILD                 |  2 ++
 tensorflow/core/kernels/concat_lib_cpu.cc     |  2 --
 tensorflow/core/kernels/concat_op.cc          |  2 --
 tensorflow/core/kernels/constant_op.cc        |  1 -
 tensorflow/core/kernels/control_flow_ops.cc   |  5 ----
 .../core/kernels/data/dataset_test_base.cc    |  2 --
 tensorflow/core/kernels/dense_update_ops.cc   |  1 -
 .../core/kernels/dynamic_partition_op.cc      |  2 --
 tensorflow/core/kernels/fill_functor.cc       |  5 +++-
 tensorflow/core/kernels/gather_op.cc          |  2 --
 tensorflow/core/kernels/identity_op.cc        |  1 -
 tensorflow/core/kernels/ragged_gather_op.cc   |  2 --
 .../kernels/ragged_tensor_from_variant_op.cc  |  2 --
 .../kernels/ragged_tensor_to_tensor_op.cc     |  2 --
 .../kernels/ragged_tensor_to_variant_op.cc    |  2 --
 .../core/kernels/resource_variable_ops.cc     |  1 -
 tensorflow/core/kernels/split_lib_cpu.cc      |  1 -
 tensorflow/core/kernels/split_op.cc           |  1 -
 tensorflow/core/kernels/strided_slice_op.cc   |  2 --
 .../core/kernels/strided_slice_op_impl.h      |  2 --
 tensorflow/core/kernels/topk_op.cc            |  2 --
 .../core/kernels/topk_op_gpu_uint32.cu.cc     | 28 +++++++++++++++++++
 .../core/kernels/topk_op_gpu_uint64.cu.cc     | 28 +++++++++++++++++++
 tensorflow/core/util/batch_util.cc            |  8 ------
 .../core/util/saved_tensor_slice_util.h       |  2 ++
 27 files changed, 71 insertions(+), 63 deletions(-)
 create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
 create mode 100644 tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc

diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index bc3e5e1743b..0cf6536e8c2 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -153,16 +153,9 @@ limitations under the License.
 #endif  // defined(IS_MOBILE_PLATFORM)  - end of TF_CALL_type defines
 
 // Defines for sets of types.
-
-// TODO(b/111604096): Add uint32 and uint64 to TF_CALL_INTEGRAL_TYPES.
-//
-// The uint32 and uint64 types were introduced in 10/2017 to be used via XLA and
-// thus were not included in TF_CALL_INTEGRAL_TYPES. Including them in
-// TF_CALL_INTEGRAL_TYPES should only happen after evaluating the effect on the
-// TF binary size and performance.
-#define TF_CALL_INTEGRAL_TYPES(m)                                      \
-  TF_CALL_int64(m) TF_CALL_int32(m) TF_CALL_uint16(m) TF_CALL_int16(m) \
-      TF_CALL_uint8(m) TF_CALL_int8(m)
+#define TF_CALL_INTEGRAL_TYPES(m)                                       \
+  TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_int32(m) \
+      TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m)
 
 #define TF_CALL_FLOAT_TYPES(m) \
   TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)
@@ -174,10 +167,10 @@ limitations under the License.
 #define TF_CALL_REAL_NUMBER_TYPES_NO_BFLOAT16(m) \
   TF_CALL_INTEGRAL_TYPES(m) TF_CALL_half(m) TF_CALL_float(m) TF_CALL_double(m)
 
-#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                              \
-  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)   \
-      TF_CALL_int64(m) TF_CALL_uint16(m) TF_CALL_int16(m) TF_CALL_uint8(m) \
-          TF_CALL_int8(m)
+#define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m)                                \
+  TF_CALL_half(m) TF_CALL_bfloat16(m) TF_CALL_float(m) TF_CALL_double(m)     \
+      TF_CALL_uint64(m) TF_CALL_int64(m) TF_CALL_uint32(m) TF_CALL_uint16(m) \
+          TF_CALL_int16(m) TF_CALL_uint8(m) TF_CALL_int8(m)
 
 #define TF_CALL_COMPLEX_TYPES(m) TF_CALL_complex64(m) TF_CALL_complex128(m)
 
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index 97eaec98ffe..d6455e012d0 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -238,11 +238,6 @@ int DataTypeSize(DataType dt) {
     TF_CALL_qint16(CASE);
     TF_CALL_quint16(CASE);
 
-    // uint32 and uint64 aren't included in TF_CALL_POD_TYPES because we
-    // don't want to define kernels for them at this stage to avoid binary
-    // bloat.
-    TF_CALL_uint32(CASE);
-    TF_CALL_uint64(CASE);
     default:
       return 0;
   }
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 279dff92c58..97f974c6af4 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4900,7 +4900,9 @@ tf_kernel_library(
         "topk_op_gpu_double.cu.cc",
         "topk_op_gpu_float.cu.cc",
         "topk_op_gpu_half.cu.cc",
+        "topk_op_gpu_uint64.cu.cc",
         "topk_op_gpu_int64.cu.cc",
+        "topk_op_gpu_uint32.cu.cc",
         "topk_op_gpu_int32.cu.cc",
         "topk_op_gpu_int16.cu.cc",
         "topk_op_gpu_uint16.cu.cc",
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index da73d3d2c56..1dec589d3ff 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -116,8 +116,6 @@ REGISTER(qint8)
 REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
-REGISTER(uint32)
-REGISTER(uint64)
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
     !defined(__ANDROID_TYPES_FULL__)
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index be3e9a67c5f..d3f3a04f33b 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -208,8 +208,6 @@ REGISTER_CONCAT(qint8);
 REGISTER_CONCAT(quint16);
 REGISTER_CONCAT(qint16);
 REGISTER_CONCAT(qint32);
-REGISTER_CONCAT(uint32);
-REGISTER_CONCAT(uint64);
 
 #undef REGISTER_CONCAT
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 4bcbc076446..dc178d17d49 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -211,7 +211,6 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
 // the conversion from uint8 to quint8.
 REGISTER_KERNEL(CPU, quint8);
 REGISTER_KERNEL(CPU, quint16);
-REGISTER_KERNEL(CPU, uint32);
 #undef REGISTER_CPU_KERNEL
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index c8e83b6f672..accb2c59540 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -101,16 +101,12 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_REF_SWITCH);
-REGISTER_CPU_SWITCH(uint64);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_SWITCH);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_SWITCH);
-REGISTER_GPU_SWITCH(uint64);
 TF_CALL_variant(REGISTER_GPU_SWITCH);
-TF_CALL_uint32(REGISTER_GPU_SWITCH);
-TF_CALL_uint32(REGISTER_GPU_REF_SWITCH);
 
 #undef REGISTER_CPU_SWITCH
 #undef REGISTER_CPU_REF_SWITCH
@@ -311,7 +307,6 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
-REGISTER_GPU_KERNEL(uint64);
 TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc
index b91ab9b733c..e41e35be1e9 100644
--- a/tensorflow/core/kernels/data/dataset_test_base.cc
+++ b/tensorflow/core/kernels/data/dataset_test_base.cc
@@ -220,8 +220,6 @@ Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) {
     break;
     TF_CALL_NUMBER_TYPES(CASE);
     TF_CALL_tstring(CASE);
-    TF_CALL_uint32(CASE);
-    TF_CALL_uint64(CASE);
     // TODO(feihugis): figure out how to support variant tensors.
 #undef CASE
     default:
diff --git a/tensorflow/core/kernels/dense_update_ops.cc b/tensorflow/core/kernels/dense_update_ops.cc
index 55e4cd7606a..71235fca143 100644
--- a/tensorflow/core/kernels/dense_update_ops.cc
+++ b/tensorflow/core/kernels/dense_update_ops.cc
@@ -98,7 +98,6 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 // uint32 not included in ALL_TYPES
-TF_CALL_uint32(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 // quint16 not included in QUANTIZIED_TYPES
 TF_CALL_quint16(REGISTER_KERNELS);
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 90ed71dccce..95af19c4c48 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -164,8 +164,6 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
       DynamicPartitionOp)
 
 TF_CALL_ALL_TYPES(REGISTER_DYNAMIC_PARTITION);
-// For partitioning fingerprints.
-TF_CALL_uint64(REGISTER_DYNAMIC_PARTITION);
 #undef REGISTER_DYNAMIC_PARTITION
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 10dd3df1915..174a4e45a79 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -45,6 +45,8 @@ DEFINE_SETZERO_CPU(Eigen::half);
 DEFINE_SETZERO_CPU(bfloat16);
 DEFINE_SETZERO_CPU(float);
 DEFINE_SETZERO_CPU(double);
+DEFINE_SETZERO_CPU(uint32);
+DEFINE_SETZERO_CPU(uint64);
 DEFINE_SETZERO_CPU(uint8);
 DEFINE_SETZERO_CPU(int8);
 DEFINE_SETZERO_CPU(uint16);
@@ -96,6 +98,8 @@ DEFINE_SETONE_CPU(Eigen::half);
 DEFINE_SETONE_CPU(bfloat16);
 DEFINE_SETONE_CPU(float);
 DEFINE_SETONE_CPU(double);
+DEFINE_SETONE_CPU(uint32);
+DEFINE_SETONE_CPU(uint64);
 DEFINE_SETONE_CPU(uint8);
 DEFINE_SETONE_CPU(int8);
 DEFINE_SETONE_CPU(uint16);
@@ -137,7 +141,6 @@ struct FillFunctor {
 TF_CALL_ALL_TYPES(DEFINE_FILL_CPU);
 DEFINE_FILL_CPU(quint8);
 DEFINE_FILL_CPU(quint16);
-DEFINE_FILL_CPU(uint32);
 #undef DEFINE_FILL_CPU
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 6d493a5f2ea..948567e019a 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -211,8 +211,6 @@ TF_CALL_ALL_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 TF_CALL_quint16(REGISTER_GATHER_CPU);
 TF_CALL_qint16(REGISTER_GATHER_CPU);
-TF_CALL_uint32(REGISTER_GATHER_CPU);
-TF_CALL_uint64(REGISTER_GATHER_CPU);
 
 #undef REGISTER_GATHER_CPU
 
diff --git a/tensorflow/core/kernels/identity_op.cc b/tensorflow/core/kernels/identity_op.cc
index fd94df9a768..daa8a1ddb25 100644
--- a/tensorflow/core/kernels/identity_op.cc
+++ b/tensorflow/core/kernels/identity_op.cc
@@ -122,7 +122,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 REGISTER_GPU_KERNEL(Variant);
-TF_CALL_uint32(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 
diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc
index 88c0d1ebd69..3bf82cba050 100644
--- a/tensorflow/core/kernels/ragged_gather_op.cc
+++ b/tensorflow/core/kernels/ragged_gather_op.cc
@@ -296,8 +296,6 @@ TF_CALL_tstring(REGISTER_CPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL);
 TF_CALL_quint16(REGISTER_CPU_KERNEL);
 TF_CALL_qint16(REGISTER_CPU_KERNEL);
-TF_CALL_uint32(REGISTER_CPU_KERNEL);
-TF_CALL_uint64(REGISTER_CPU_KERNEL);
 #undef REGISTER_CPU_KERNEL
 #undef REGISTER_CPU_KERNEL_WITH_INDEX_TYPE
 
diff --git a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
index f83bcb38c6c..ad0712e6fd0 100644
--- a/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_from_variant_op.cc
@@ -308,8 +308,6 @@ TF_CALL_tstring(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 TF_CALL_quint16(REGISTER_KERNELS);
 TF_CALL_qint16(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS);
-TF_CALL_uint64(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 #undef REGISTER_KERNELS_WITH_SPLIT_TYPE
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
index d729c43f25a..9ae5d7ffbdc 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
@@ -561,8 +561,6 @@ TF_CALL_string(REGISTER_CPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL);
 TF_CALL_quint16(REGISTER_CPU_KERNEL);
 TF_CALL_qint16(REGISTER_CPU_KERNEL);
-TF_CALL_uint32(REGISTER_CPU_KERNEL);
-TF_CALL_uint64(REGISTER_CPU_KERNEL);
 
 #undef REGISTER_CPU_KERNEL
 
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
index 7a5ae1c6240..64c372b005e 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
@@ -213,8 +213,6 @@ TF_CALL_tstring(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
 TF_CALL_quint16(REGISTER_KERNELS);
 TF_CALL_qint16(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS);
-TF_CALL_uint64(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 #undef REGISTER_KERNELS_WITH_SPLIT_TYPE
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 0fc1d53749f..79a64cb9219 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -512,7 +512,6 @@ class AssignVariableOp : public OpKernel {
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/split_lib_cpu.cc b/tensorflow/core/kernels/split_lib_cpu.cc
index 0cb0a94d498..a3060e4e90d 100644
--- a/tensorflow/core/kernels/split_lib_cpu.cc
+++ b/tensorflow/core/kernels/split_lib_cpu.cc
@@ -43,7 +43,6 @@ void Split::operator()(
 
 TF_CALL_ALL_TYPES(DEFINE_CPU_KERNELS)
 DEFINE_CPU_KERNELS(quint8)
-DEFINE_CPU_KERNELS(uint64)
 
 #ifdef TENSORFLOW_USE_SYCL
 template 
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index f09740c6198..08575f01f67 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -404,7 +404,6 @@ class SplitOpSYCL : public SplitOpBase {
 
 TF_CALL_ALL_TYPES(REGISTER_SPLIT);
 REGISTER_SPLIT(quint8);
-REGISTER_SPLIT(uint64);
 
 #undef REGISTER_SPLIT
 
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index ccc1984bb98..b4099213303 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -440,8 +440,6 @@ class StridedSliceAssignOp : public OpKernel {
                           StridedSliceAssignOp)
 
 TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
-TF_CALL_uint32(REGISTER_STRIDED_SLICE);
-TF_CALL_uint64(REGISTER_STRIDED_SLICE);
 
 #undef REGISTER_STRIDED_SLICE
 
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 1ae959b7b3f..5ce1d773e33 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -287,8 +287,6 @@ TF_CALL_GPU_ALL_TYPES(DECLARE_FOR_N_GPU);
 #endif  // END GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
-TF_CALL_uint32(DECLARE_FOR_N_CPU);
-TF_CALL_uint64(DECLARE_FOR_N_CPU);
 
 #ifdef TENSORFLOW_USE_SYCL
 #define PREVENT_FOR_N_SYCL(T) \
diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc
index c555b42f005..50325b7bcfe 100644
--- a/tensorflow/core/kernels/topk_op.cc
+++ b/tensorflow/core/kernels/topk_op.cc
@@ -258,7 +258,6 @@ namespace functor {
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 TF_CALL_INTEGRAL_TYPES(DECLARE_GPU_SPEC);
-TF_CALL_uint32(DECLARE_GPU_SPEC);
 
 #undef DECLARE_GPU_SPEC
 
@@ -276,7 +275,6 @@ TF_CALL_uint32(DECLARE_GPU_SPEC);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS);
 TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS);
-TF_CALL_uint32(REGISTER_KERNELS)
 #undef REGISTER_KERNELS
 
 #endif  // end GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
new file mode 100644
index 00000000000..16e2e0e9420
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint32.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GpuDevice, uint32>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc
new file mode 100644
index 00000000000..895247a63a2
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint64.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GpuDevice, uint64>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/util/batch_util.cc b/tensorflow/core/util/batch_util.cc
index b88c365ced0..e03188b04da 100644
--- a/tensorflow/core/util/batch_util.cc
+++ b/tensorflow/core/util/batch_util.cc
@@ -182,8 +182,6 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
   switch (element.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
@@ -207,8 +205,6 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
   switch (parent.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
@@ -280,8 +276,6 @@ Status CopyContiguousSlices(const Tensor& src, int64 src_offset,
   switch (src.dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented("CopyContiguousSlices unhandled data type: ",
@@ -308,8 +302,6 @@ Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index) {
   switch (parent->dtype()) {
     TF_CALL_ALL_TYPES(HANDLE_TYPE);
     TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
-    TF_CALL_uint32(HANDLE_TYPE);
-    TF_CALL_uint64(HANDLE_TYPE);
 #undef HANDLE_TYPE
     default:
       return errors::Unimplemented(
diff --git a/tensorflow/core/util/saved_tensor_slice_util.h b/tensorflow/core/util/saved_tensor_slice_util.h
index 09b9235b711..1f9768f5163 100644
--- a/tensorflow/core/util/saved_tensor_slice_util.h
+++ b/tensorflow/core/util/saved_tensor_slice_util.h
@@ -116,7 +116,9 @@ TENSOR_PROTO_EXTRACT_TYPE(double, double, double);
 TENSOR_PROTO_EXTRACT_TYPE_COMPLEX(complex64, scomplex, float);
 TENSOR_PROTO_EXTRACT_TYPE_COMPLEX(complex128, dcomplex, double);
 TENSOR_PROTO_EXTRACT_TYPE(int32, int, int32);
+TENSOR_PROTO_EXTRACT_TYPE(uint32, uint32, uint32);
 TENSOR_PROTO_EXTRACT_TYPE(int64, int64, protobuf_int64);
+TENSOR_PROTO_EXTRACT_TYPE(uint64, uint64, protobuf_uint64);
 TENSOR_PROTO_EXTRACT_TYPE(uint16, int, int32);
 TENSOR_PROTO_EXTRACT_TYPE(uint8, int, int32);
 TENSOR_PROTO_EXTRACT_TYPE(int8, int, int32);

From 9f20b156bc7862fb621756fd5d6744255b1f3735 Mon Sep 17 00:00:00 2001
From: George Karpenkov 
Date: Fri, 19 Jun 2020 01:13:01 -0700
Subject: [PATCH 098/112] [XLA:GPU] [NFC] Clarify the precondition for the fast
 reduction emitter

PiperOrigin-RevId: 317266013
Change-Id: I384acac279f0db53f195d5b43318c38c87a1739c
---
 tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index b97aa3651c6..01bcf456f75 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -226,6 +226,11 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) {
       dims_to_keep.push_back(dim);
     }
   }
+
+  // We support fast codegen for three cases:
+  // 1) Row reduction: (K, R)
+  // 2) Column reduction: (K, R, K)
+  // 3) "Batched" row reduction: (R, K, R)
   if (!LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),
                                             dims_to_keep) &&
       !LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),
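
Read as a single predicate, the three supported shapes above amount to: either
the kept ("K") dimensions or the reduced ("R") dimensions must form one
contiguous run. A standalone illustrative sketch of that predicate (it ignores
layout permutations and size-1 dimensions, and is not XLA's actual
implementation):

    #include <iostream>
    #include <vector>

    // True if the dimensions selected by `mask` form a single contiguous run.
    bool IsContiguousRun(const std::vector<bool>& mask) {
      int first = -1, last = -1, count = 0;
      for (int d = 0; d < static_cast<int>(mask.size()); ++d) {
        if (mask[d]) {
          if (first < 0) first = d;
          last = d;
          ++count;
        }
      }
      return count == 0 || (last - first + 1) == count;
    }

    bool SupportsFastReduction(const std::vector<bool>& is_reduced) {
      std::vector<bool> is_kept(is_reduced.size());
      for (size_t d = 0; d < is_reduced.size(); ++d) is_kept[d] = !is_reduced[d];
      return IsContiguousRun(is_kept) || IsContiguousRun(is_reduced);
    }

    int main() {
      std::cout << SupportsFastReduction({false, true}) << "\n";         // (K, R) row reduction -> 1
      std::cout << SupportsFastReduction({false, true, false}) << "\n";  // (K, R, K) column reduction -> 1
      std::cout << SupportsFastReduction({true, false, true}) << "\n";   // (R, K, R) batched row -> 1
      return 0;
    }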

From 051d1b70f5f0636316e2651630f0ade554f192c0 Mon Sep 17 00:00:00 2001
From: Stefano Galarraga 
Date: Fri, 19 Jun 2020 01:21:24 -0700
Subject: [PATCH 099/112] Fix NNAPI delegation error on models with MAX/MIN
 operations with scalar quantized operators

PiperOrigin-RevId: 317266736
Change-Id: Ieed8a77685d4ca0d51389b5976addf0de167cfcf
---
 tensorflow/lite/delegates/nnapi/nnapi_delegate.cc |  2 ++
 tensorflow/lite/kernels/maximum_minimum_test.cc   | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index a3a3f9fda4d..1c35ee370c2 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -160,6 +160,8 @@ bool IsScalarInputSupported(int builtin_code) {
     case kTfLiteBuiltinLess:
     case kTfLiteBuiltinLessEqual:
     case kTfLiteBuiltinPow:
+    case kTfLiteBuiltinMaximum:
+    case kTfLiteBuiltinMinimum:
       return true;
     default:
       return false;
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index 2c036e369bd..803fe91c460 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -190,6 +190,17 @@ TEST(MaximumOpTest, Int32WithBroadcastTest_ScalarY) {
                      data1, data2, {1, 0, -1, -2, 2, 2}, /*is_constant=*/true);
 }
 
+TEST(MaximumOpTest, Int8WithBroadcastTest_ScalarY) {
+  std::initializer_list<int8_t> data1 = {1, 0, -1, -2, 3, 11};
+  std::initializer_list<int8_t> data2 = {2};
+  TestModel<int8_t>(BuiltinOperator_MAXIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {}}, {TensorType_INT8, {3, 1, 2}}, data1,
+                    data2, {2, 2, 2, 2, 3, 11}, /*is_constant=*/true);
+  TestModel<int8_t>(BuiltinOperator_MINIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {}}, {TensorType_INT8, {3, 1, 2}}, data1,
+                    data2, {1, 0, -1, -2, 2, 2}, /*is_constant=*/true);
+}
+
 TEST(MaxMinOpTest, Int8Test8D) {
   std::initializer_list<int8_t> data1 = {1, 0, 2, 11, 2, 23};
   std::initializer_list<int8_t> data2 = {0, 0, 1, 12, 123, 1};

From e51b17f4582183a216d3a47450117c5e8cdd387d Mon Sep 17 00:00:00 2001
From: Adrian Kuegel 
Date: Fri, 19 Jun 2020 01:49:14 -0700
Subject: [PATCH 100/112] Add a small test to cover the mlir generated Tanh GPU
 kernel.

This test is a first step towards ensuring that we don't accidentally
break the kernel generation.

PiperOrigin-RevId: 317269120
Change-Id: Iad6bdd7ab7e9fb819a478c947ba6294a191f1099
---
 tensorflow/core/kernels/BUILD                 | 19 +++++
 .../mlir_generated_op_gpu_tanh_test.cc        | 85 +++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 97f974c6af4..0b7a092033b 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -4168,6 +4168,25 @@ tf_kernel_library(
     ]),
 )
 
+tf_cuda_cc_test(
+    name = "mlir_generated_op_gpu_tanh_test",
+    size = "small",
+    srcs = if_mlir_generated_gpu_kernels_enabled(["mlir_generated_op_gpu_tanh_test.cc"]),
+    tags = tf_cuda_tests_tags() + ["no_rocm"],
+    deps = [
+        ":cwise_op",
+        ":ops_testutil",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/common_runtime:device",
+        "//tensorflow/core/common_runtime:device_factory",
+    ],
+)
+
 tf_kernel_library(
     name = "nextafter_op",
     prefix = "nextafter_op",
diff --git a/tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc b/tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc
new file mode 100644
index 00000000000..39c1d709b1e
--- /dev/null
+++ b/tensorflow/core/kernels/mlir_generated_op_gpu_tanh_test.cc
@@ -0,0 +1,85 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class MlirGeneratedOpGpuTanhTest : public OpsTestBase {
+ protected:
+  void SetUp() override {
+    std::unique_ptr<tensorflow::Device> device_gpu(
+        tensorflow::DeviceFactory::NewDevice("GPU", {},
+                                             "/job:a/replica:0/task:0"));
+    SetDevice(tensorflow::DEVICE_GPU, std::move(device_gpu));
+  }
+  template <typename T>
+  void RunTanhOp(std::initializer_list<T> input) {
+    TensorShape shape({2, 7});
+    TF_ASSERT_OK(NodeDefBuilder("tanh_op", "Tanh")
+                     .Input(FakeInput(DataTypeToEnum<T>::v()))
+                     .Attr("T", DataTypeToEnum<T>::v())
+                     .Finalize(node_def()));
+
+    TF_ASSERT_OK(InitOp());
+    AddInputFromArray<T>(shape, input);
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor expected_tensor(allocator(), DataTypeToEnum<T>::value, shape);
+    std::vector<T> expected;
+    expected.reserve(input.size());
+    for (const T& inp : input) {
+      expected.push_back(static_cast<T>(std::tanh(static_cast<float>(inp))));
+    }
+    test::FillValues<T>(&expected_tensor, expected);
+    test::ExpectClose(expected_tensor, *GetOutput(0));
+  }
+};
+
+TEST_F(MlirGeneratedOpGpuTanhTest, TanhFloat) {
+  RunTanhOp<float>({-18.0f, -9.0f, -1e-6f, -0.0f, 0.0f, 1e-6f, 0.1f, 0.2f, 0.3f,
+                    0.5f, 0.7f, 0.9f, 9.0f, 18.0f});
+}
+
+TEST_F(MlirGeneratedOpGpuTanhTest, TanhDouble) {
+  RunTanhOp({-18.0, -9.0, -1e-6, -0.0, 0.0, 1e-6, 0.1, 0.2, 0.3, 0.5,
+                     0.7, 0.9, 9.0, 18.0});
+}
+
+TEST_F(MlirGeneratedOpGpuTanhTest, TanhHalf) {
+  RunTanhOp<Eigen::half>(
+      {static_cast<Eigen::half>(-18.0), static_cast<Eigen::half>(-9.0),
+       static_cast<Eigen::half>(-1e-6), static_cast<Eigen::half>(-0.0),
+       static_cast<Eigen::half>(0.0), static_cast<Eigen::half>(1e-6),
+       static_cast<Eigen::half>(0.1), static_cast<Eigen::half>(0.2),
+       static_cast<Eigen::half>(0.3), static_cast<Eigen::half>(0.5),
+       static_cast<Eigen::half>(0.7), static_cast<Eigen::half>(0.9),
+       static_cast<Eigen::half>(9.0), static_cast<Eigen::half>(18.0)});
+}
+
+}  // namespace
+}  // end namespace tensorflow

From 772433a2a2120d0aefc6c3628c6254d5a1aaf19d Mon Sep 17 00:00:00 2001
From: YoungSeok Yoon 
Date: Fri, 19 Jun 2020 01:58:58 -0700
Subject: [PATCH 101/112] Add flag for using optimized TFLite CPU kernels on
 iOS

This adds new experimental flags to the interpreter options of the TFLite Obj-C
and Swift APIs, which can be used to opt in to a set of highly optimized
floating point kernels provided via the XNNPACK delegate. The flags can be used
as follows.

Obj-C:

    TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
    options.useXNNPACK = YES;
    NSError *error;
    TFLInterpreter *interpreter =
        [[TFLInterpreter alloc] initWithModelPath:@"model/path"
                                          options:options
                                            error:&error];

Swift:

    var options = InterpreterOptions()
    options.isXNNPackEnabled = true
    var interpreter = try Interpreter(modelPath: "model/path", options: options)
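
For reference, both wrappers funnel into the TFLite C API; a rough sketch of
the equivalent sequence there (the model path and thread count are
illustrative) looks like this:

    #include "tensorflow/lite/c/c_api.h"
    #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"

    int main() {
      TfLiteModel* model = TfLiteModelCreateFromFile("model/path.tflite");
      TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate();

      // Create the XNNPACK delegate and attach it to the interpreter options.
      TfLiteXNNPackDelegateOptions xnnpack_options =
          TfLiteXNNPackDelegateOptionsDefault();
      xnnpack_options.num_threads = 2;
      TfLiteDelegate* xnnpack_delegate =
          TfLiteXNNPackDelegateCreate(&xnnpack_options);
      TfLiteInterpreterOptionsAddDelegate(options, xnnpack_delegate);

      TfLiteInterpreter* interpreter = TfLiteInterpreterCreate(model, options);
      TfLiteInterpreterAllocateTensors(interpreter);
      // ... run inference ...

      // The delegate must outlive the interpreter, so delete it last.
      TfLiteInterpreterDelete(interpreter);
      TfLiteInterpreterOptionsDelete(options);
      TfLiteModelDelete(model);
      TfLiteXNNPackDelegateDelete(xnnpack_delegate);
      return 0;
    }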

PiperOrigin-RevId: 317270012
Change-Id: I82aae43c3de13ab08af3c70513e2a458e807b0f1
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  4 ++
 tensorflow/lite/experimental/ios/BUILD.apple  | 18 +++++
 tensorflow/lite/experimental/objc/BUILD.apple |  1 +
 .../objc/TensorFlowLiteObjC-nightly.podspec   |  1 +
 .../objc/TensorFlowLiteObjC.podspec           |  1 +
 .../objc/TensorFlowLiteObjC.podspec.template  |  1 +
 .../objc/apis/TFLInterpreterOptions.h         | 21 ++++++
 .../objc/sources/TFLInterpreter.mm            | 15 +++++
 .../objc/tests/TFLInterpreterOptionsTests.m   |  9 +++
 .../swift/Sources/Interpreter.swift           | 67 ++++++++++++++++---
 .../swift/Tests/InterpreterTests.swift        | 62 +++++++++++------
 11 files changed, 171 insertions(+), 29 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 97e6aea2a6b..eaf7d8f6f03 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -14,6 +14,10 @@ EMSCRIPTEN_LINKOPTS = [
     "-s TOTAL_MEMORY=134217728",
 ]
 
+exports_files([
+    "xnnpack_delegate.h",
+])
+
 cc_library(
     name = "xnnpack_delegate",
     srcs = ["xnnpack_delegate.cc"],
diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple
index 1a85b604f9b..7a40ca7b8e7 100644
--- a/tensorflow/lite/experimental/ios/BUILD.apple
+++ b/tensorflow/lite/experimental/ios/BUILD.apple
@@ -18,10 +18,26 @@ sh_binary(
     ],
 )
 
+# When the static framework is built with bazel, all the header files are moved
+# to the "Headers" directory with no header path prefixes. This auxiliary rule
+# is used for stripping the path prefix to the "common.h" file included by the
+# "xnnpack_delegate.h" header.
+genrule(
+    name = "strip_xnnpack_include_hdr",
+    srcs = ["//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h"],
+    outs = ["xnnpack_delegate.h"],
+    cmd = """
+    sed 's|#include ".*common.h"|#include "common.h"|'\
+    "$(location //tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h)"\
+    > "$@"
+    """,
+)
+
 # bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteC_framework
 tflite_ios_static_framework(
     name = "TensorFlowLiteC_framework",
     hdrs = [
+        ":xnnpack_delegate.h",
         "//tensorflow/lite/c:c_api.h",
         "//tensorflow/lite/c:common.h",
     ],
@@ -105,6 +121,7 @@ cc_library(
     hdrs = [
         "//tensorflow/lite/c:c_api.h",
         "//tensorflow/lite/c:common.h",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate.h",
     ],
     tags = [
         "nobuilder",
@@ -112,6 +129,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/c:c_api",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
     ],
 )
 
diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple
index 09d4547813a..d26d90c46a1 100644
--- a/tensorflow/lite/experimental/objc/BUILD.apple
+++ b/tensorflow/lite/experimental/objc/BUILD.apple
@@ -64,6 +64,7 @@ objc_library(
     visibility = ios_visibility_whitelist(),
     deps = [
         "//tensorflow/lite/c:c_api",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec
index e039fb57114..eed0f087f44 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec
@@ -26,6 +26,7 @@ Pod::Spec.new do |s|
     objc_dir + '{apis,sources}/*.{h,m,mm}',
     tfl_dir + 'c/c_api.h',
     tfl_dir + 'c/common.h',
+    tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
   ]
   s.module_map = objc_dir + 'apis/framework.modulemap'
   s.dependency 'TensorFlowLiteC', "~> #{s.version}"
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
index c673cfad759..5817619a58f 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec
@@ -26,6 +26,7 @@ Pod::Spec.new do |s|
     objc_dir + '{apis,sources}/*.{h,m,mm}',
     tfl_dir + 'c/c_api.h',
     tfl_dir + 'c/common.h',
+    tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
   ]
   s.module_map = objc_dir + 'apis/framework.modulemap'
   s.dependency 'TensorFlowLiteC', "#{s.version}"
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template
index fc9e10e4a2c..4ab5753e016 100644
--- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template
@@ -26,6 +26,7 @@ Pod::Spec.new do |s|
     objc_dir + '{apis,sources}/*.{h,m,mm}',
     tfl_dir + 'c/c_api.h',
     tfl_dir + 'c/common.h',
+    tfl_dir + 'delegates/xnnpack/xnnpack_delegate.h',
   ]
   s.module_map = objc_dir + 'apis/framework.modulemap'
   s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly'
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
index 6461fbf0178..d7dbb2bd970 100644
--- a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
@@ -25,6 +25,27 @@ NS_ASSUME_NONNULL_BEGIN
  */
 @property(nonatomic) NSUInteger numberOfThreads;
 
+/**
+ * Experimental: Enable an optimized set of floating point CPU kernels (provided by XNNPACK).
+ *
+ * Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided via the
+ * XNNPACK delegate. Currently, this is restricted to a subset of floating point operations.
+ * Eventually, we plan to enable this by default, as it can provide significant performance benefits
+ * for many classes of floating point models. See
+ * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+ * for more details.
+ *
+ * Things to keep in mind when enabling this flag:
+ *
+ *     * Startup time and resize time may increase.
+ *     * Baseline memory consumption may increase.
+ *     * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+ *     * Quantized models will not see any benefit.
+ *
+ * WARNING: This is an experimental interface that is subject to change.
+ */
+@property(nonatomic) BOOL useXNNPACK;
+
 /**
  * Initializes a new instance of `TFLInterpreterOptions`.
  *
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
index 94031ee5428..34dd119885d 100644
--- a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -23,6 +23,7 @@
 #import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
 
 #include "tensorflow/lite/c/c_api.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 NS_ASSUME_NONNULL_BEGIN
 
@@ -45,6 +46,9 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 /** TfLiteInterpreter backed by C API. */
 @property(nonatomic, nullable) TfLiteInterpreter *interpreter;
 
+/** TfLiteDelegate backed by C API. */
+@property(nonatomic, nullable) TfLiteDelegate *xnnpack_delegate;
+
 @end
 
 @implementation TFLInterpreter
@@ -53,6 +57,7 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
 
 - (void)dealloc {
   TfLiteInterpreterDelete(_interpreter);
+  TfLiteXNNPackDelegateDelete(_xnnpack_delegate);
 }
 
 #pragma mark - Public
@@ -104,6 +109,16 @@ static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_
       }
       TfLiteInterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
 
+      if (options.useXNNPACK) {
+        TfLiteXNNPackDelegateOptions xnnpack_options = TfLiteXNNPackDelegateOptionsDefault();
+        if (options.numberOfThreads > 0) {
+          xnnpack_options.num_threads = (int32_t)options.numberOfThreads;
+        }
+
+        _xnnpack_delegate = TfLiteXNNPackDelegateCreate(&xnnpack_options);
+        TfLiteInterpreterOptionsAddDelegate(cOptions, _xnnpack_delegate);
+      }
+
       _interpreter = TfLiteInterpreterCreate(model, cOptions);
       if (_interpreter == nullptr) {
         [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
index 00b800d6af9..286cba98b49 100644
--- a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -32,6 +32,7 @@ NS_ASSUME_NONNULL_BEGIN
   TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
   XCTAssertNotNil(options);
   XCTAssertEqual(options.numberOfThreads, 0);
+  XCTAssertFalse(options.useXNNPACK);
 }
 
 - (void)testSetNumberOfThread {
@@ -44,6 +45,14 @@ NS_ASSUME_NONNULL_BEGIN
   XCTAssertEqual(options.numberOfThreads, 3);
 }
 
+- (void)testUseXNNPACK {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.useXNNPACK = YES;
+  XCTAssertTrue(options.useXNNPACK);
+  options.useXNNPACK = NO;
+  XCTAssertFalse(options.useXNNPACK);
+}
+
 @end
 
 NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
index b83c36c4e1d..3567822208d 100644
--- a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -39,6 +39,9 @@ public final class Interpreter {
   /// The underlying `TfLiteInterpreter` C pointer.
   private var cInterpreter: CInterpreter?
 
+  /// The underlying `TfLiteDelegate` C pointer for XNNPACK delegate.
+  private var cXNNPackDelegate: Delegate.CDelegate?
+
   /// Creates a new instance with the given values.
   ///
   /// - Parameters:
@@ -78,6 +81,14 @@ public final class Interpreter {
       )
     }
     delegates?.forEach { TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, $0.cDelegate) }
+
+    // Configure the XNNPack delegate after the other delegates explicitly added by the user.
+    options.map {
+      if $0.isXNNPackEnabled {
+        configureXNNPack(options: $0, cInterpreterOptions: cInterpreterOptions)
+      }
+    }
+
     guard let cInterpreter = TfLiteInterpreterCreate(model.cModel, cInterpreterOptions) else {
       throw InterpreterError.failedToCreateInterpreter
     }
@@ -86,6 +97,7 @@ public final class Interpreter {
 
   deinit {
     TfLiteInterpreterDelete(cInterpreter)
+    TfLiteXNNPackDelegateDelete(cXNNPackDelegate)
   }
 
   /// Invokes the interpreter to perform inference from the loaded graph.
@@ -201,12 +213,13 @@ public final class Interpreter {
     guard case 0...maxIndex = index else {
       throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
     }
-    guard TfLiteInterpreterResizeInputTensor(
-      cInterpreter,
-      Int32(index),
-      shape.int32Dimensions,
-      Int32(shape.rank)
-    ) == kTfLiteOk
+    guard
+      TfLiteInterpreterResizeInputTensor(
+        cInterpreter,
+        Int32(index),
+        shape.int32Dimensions,
+        Int32(shape.rank)
+      ) == kTfLiteOk
     else {
       throw InterpreterError.failedToResizeInputTensor(index: index)
     }
@@ -236,11 +249,11 @@ public final class Interpreter {
     }
 
     #if swift(>=5.0)
-    let status = data.withUnsafeBytes {
-      TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
-    }
+      let status = data.withUnsafeBytes {
+        TfLiteTensorCopyFromBuffer(cTensor, $0.baseAddress, data.count)
+      }
     #else
-    let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
+      let status = data.withUnsafeBytes { TfLiteTensorCopyFromBuffer(cTensor, $0, data.count) }
     #endif  // swift(>=5.0)
     guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
     return try input(at: index)
@@ -256,6 +269,18 @@ public final class Interpreter {
       throw InterpreterError.failedToAllocateTensors
     }
   }
+
+  // MARK: - Private
+
+  private func configureXNNPack(options: Options, cInterpreterOptions: OpaquePointer) {
+    var cXNNPackOptions = TfLiteXNNPackDelegateOptionsDefault()
+    if let threadCount = options.threadCount, threadCount > 0 {
+      cXNNPackOptions.num_threads = Int32(threadCount)
+    }
+
+    cXNNPackDelegate = TfLiteXNNPackDelegateCreate(&cXNNPackOptions)
+    TfLiteInterpreterOptionsAddDelegate(cInterpreterOptions, cXNNPackDelegate)
+  }
 }
 
 extension Interpreter {
@@ -265,6 +290,28 @@ extension Interpreter {
     /// indicating that the `Interpreter` will decide the number of threads to use.
     public var threadCount: Int? = nil
 
+    /// Indicates whether an optimized set of floating point CPU kernels, provided by XNNPACK, is
+    /// enabled.
+    ///
+    /// - Experiment:
+    /// Enabling this flag will enable use of a new, highly optimized set of CPU kernels provided
+    /// via the XNNPACK delegate. Currently, this is restricted to a subset of floating point
+    /// operations. Eventually, we plan to enable this by default, as it can provide significant
+    /// performance benefits for many classes of floating point models. See
+    /// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/delegates/xnnpack/README.md
+    /// for more details.
+    ///
+    /// - Important:
+    /// Things to keep in mind when enabling this flag:
+    ///
+    ///     * Startup time and resize time may increase.
+    ///     * Baseline memory consumption may increase.
+    ///     * Compatibility with other delegates (e.g., GPU) has not been fully validated.
+    ///     * Quantized models will not see any benefit.
+    ///
+    /// - Warning: This is an experimental interface that is subject to change.
+    public var isXNNPackEnabled: Bool = false
+
     /// Creates a new instance with the default values.
     public init() {}
   }
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
index 8d0140279af..67d8120df4d 100644
--- a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
@@ -142,10 +142,12 @@ class InterpreterTests: XCTestCase {
   }
 
   func testResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() {
-    XCTAssertThrowsError(try interpreter.resizeInput(
-      at: AddModel.invalidIndex,
-      to: [2, 2, 3]
-    )) { error in
+    XCTAssertThrowsError(
+      try interpreter.resizeInput(
+        at: AddModel.invalidIndex,
+        to: [2, 2, 3]
+      )
+    ) { error in
       let maxIndex = AddModel.inputTensorCount - 1
       self.assertEqualErrors(
         actual: error,
@@ -162,10 +164,12 @@ class InterpreterTests: XCTestCase {
   }
 
   func testCopyDataToInputTensorAtIndex_ThrowsInvalidIndex() {
-    XCTAssertThrowsError(try interpreter.copy(
-      AddModel.inputData,
-      toInputAt: AddModel.invalidIndex
-    )) { error in
+    XCTAssertThrowsError(
+      try interpreter.copy(
+        AddModel.inputData,
+        toInputAt: AddModel.invalidIndex
+      )
+    ) { error in
       let maxIndex = AddModel.inputTensorCount - 1
       self.assertEqualErrors(
         actual: error,
@@ -178,10 +182,12 @@ class InterpreterTests: XCTestCase {
     try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
     try interpreter.allocateTensors()
     let invalidData = Data(count: AddModel.dataCount - 1)
-    XCTAssertThrowsError(try interpreter.copy(
-      invalidData,
-      toInputAt: AddModel.validIndex
-    )) { error in
+    XCTAssertThrowsError(
+      try interpreter.copy(
+        invalidData,
+        toInputAt: AddModel.validIndex
+      )
+    ) { error in
       self.assertEqualErrors(
         actual: error,
         expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount)
@@ -223,12 +229,20 @@ class InterpreterOptionsTests: XCTestCase {
   func testInitWithDefaultValues() {
     let options = Interpreter.Options()
     XCTAssertNil(options.threadCount)
+    XCTAssertFalse(options.isXNNPackEnabled)
   }
 
   func testInitWithCustomValues() {
     var options = Interpreter.Options()
+
     options.threadCount = 2
     XCTAssertEqual(options.threadCount, 2)
+
+    options.isXNNPackEnabled = false
+    XCTAssertFalse(options.isXNNPackEnabled)
+
+    options.isXNNPackEnabled = true
+    XCTAssertTrue(options.isXNNPackEnabled)
   }
 
   func testEquatable() {
@@ -242,6 +256,15 @@ class InterpreterOptionsTests: XCTestCase {
 
     options2.threadCount = 3
     XCTAssertNotEqual(options1, options2)
+
+    options2.threadCount = 2
+    XCTAssertEqual(options1, options2)
+
+    options2.isXNNPackEnabled = true
+    XCTAssertNotEqual(options1, options2)
+
+    options1.isXNNPackEnabled = true
+    XCTAssertEqual(options1, options2)
   }
 }
 
@@ -326,14 +349,15 @@ extension Array {
   init?(unsafeData: Data) {
     guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
     #if swift(>=5.0)
-    self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
+      self = unsafeData.withUnsafeBytes { .init($0.bindMemory(to: Element.self)) }
     #else
-    self = unsafeData.withUnsafeBytes {
-      .init(UnsafeBufferPointer(
-        start: $0,
-        count: unsafeData.count / MemoryLayout.stride
-        count: unsafeData.count / MemoryLayout<Element>.stride
-    }
+      self = unsafeData.withUnsafeBytes {
+        .init(
+          UnsafeBufferPointer(
+            start: $0,
+            count: unsafeData.count / MemoryLayout<Element>.stride
+          ))
+      }
     #endif  // swift(>=5.0)
   }
 }

From 4b0a6f818fa8e3c38fd0cf68d9a647f82cf6c93a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 02:01:50 -0700
Subject: [PATCH 102/112] Update GraphDef version to 437.

PiperOrigin-RevId: 317270285
Change-Id: Ib8d1e6dbb565c01d2bdf0304a03be1c1eebbde41
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 546d86e58fa..9a79fc1eddf 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 436  // Updated: 2020/6/18
+#define TF_GRAPH_DEF_VERSION 437  // Updated: 2020/6/19
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From d41d28120e6aac1efbad27523a78cd254434dc4e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 02:02:01 -0700
Subject: [PATCH 103/112] compat: Update forward compatibility horizon to
 2020-06-19

PiperOrigin-RevId: 317270310
Change-Id: Idc8188172496af9f2494c580cdab27558b16e4a8
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 32545ac8463..22988d26cfc 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 18)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 19)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 9d1ec55aed0a4d9baf7302974fefe08546bfad25 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 02:04:36 -0700
Subject: [PATCH 104/112] Integrate LLVM at
 https://github.com/llvm/llvm-project/commit/7f0d7f326316

PiperOrigin-RevId: 317270655
Change-Id: Ic80ab697da45212c8d58bcda989e5ee0a330b565
---
 .../mlir/xla/transforms/hlo_legalize_to_lhlo.cc     | 13 +++++++------
 .../compiler/mlir/xla/transforms/legalize_tf.cc     |  4 ++--
 .../mlir/xla/transforms/lhlo_legalize_to_gpu.cc     |  2 +-
 .../xla/transforms/lhlo_legalize_to_llvm_pass.cc    |  2 +-
 .../transforms/lhlo_legalize_to_parallel_loops.cc   |  2 +-
 .../mlir/xla/transforms/xla_legalize_to_linalg.cc   |  4 ++--
 .../xla/service/mlir_gpu/kernel_lowering.cc         |  2 +-
 tensorflow/workspace.bzl                            |  4 ++--
 8 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
index 1cfe0c12e20..a11b08e0ea6 100644
--- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
@@ -389,10 +389,13 @@ struct HloLegalizeToLhlo
     target.addLegalOp();
     target.addLegalOp();
     target.addIllegalDialect();
+
+    BufferAssignmentTypeConverter converter;
     target.addDynamicallyLegalOp([&](FuncOp op) {
       auto inputs = op.getType().getInputs();
-      return std::all_of(inputs.begin(), inputs.end(),
-                         [](Type input) { return input.isa(); });
+      return llvm::all_of(inputs,
+                          [](Type input) { return input.isa(); }) &&
+             converter.isLegal(&op.getBody());
     });
     target.addDynamicallyLegalOp([&](mlir::ReturnOp returnOp) {
       return std::all_of(returnOp.operand_type_begin(),
@@ -401,8 +404,7 @@ struct HloLegalizeToLhlo
     });
 
     auto module = getOperation();
-    BufferAssignmentTypeConverter converter;
-    module.walk([&](FuncOp func) {
+    module.walk([&](FuncOp func) -> WalkResult {
       BufferAssignmentPlacer bufferAssignment(func);
       OwningRewritePatternList patterns;
       populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment,
@@ -418,8 +420,7 @@ struct HloLegalizeToLhlo
             /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment,
                                                   &converter, &patterns);
       }
-      return WalkResult(
-          applyPartialConversion(func, target, patterns, &converter));
+      return applyPartialConversion(func, target, patterns);
     });
   }
 
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
index b7cad554043..1788cd1b270 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
@@ -5238,8 +5238,8 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion,
     // Fully qualify ReturnOp here as xla_hlo dialect also defines a ReturnOp.
     target.addLegalOp();
     DenseSet nonlegalized_ops;
-    LogicalResult result = applyPartialConversion(
-        op, target, patterns, /*converter=*/nullptr, &nonlegalized_ops);
+    LogicalResult result =
+        applyPartialConversion(op, target, patterns, &nonlegalized_ops);
     // In order to enforce that the conversion result is fully converted,
     // fail if there are any nonlegalized ops in the set.
     if (failed(result) || !nonlegalized_ops.empty()) {
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc
index f0eb3cc1a0f..c23b8b49268 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc
@@ -177,7 +177,7 @@ struct LhloLegalizeToGpu : public PassWrapper {
     target.addIllegalOp();
     auto func = getFunction();
     patterns.insert(func.getContext());
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc
index 9b809049290..63265c4a7e7 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc
@@ -43,7 +43,7 @@ class TestLhloToLLVMPass
     target.addLegalOp();
     target.addIllegalDialect();
 
-    if (failed(applyFullConversion(m, target, patterns, &converter))) {
+    if (failed(applyFullConversion(m, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc
index b3112d49103..65962c5b7a5 100644
--- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc
@@ -711,7 +711,7 @@ struct LhloLegalizeToParallelLoops
     target.addIllegalOp();
 
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc
index ad78a01100b..8a2f8ce7d04 100644
--- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc
@@ -867,7 +867,7 @@ struct LhloLegalizeToLinalg
 
     auto func = getFunction();
     populateLHLOToLinalgConversionPattern(func.getContext(), &patterns);
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
@@ -882,7 +882,7 @@ struct HloLegalizeToLinalg
 
     auto func = getFunction();
     xla_hlo::populateHLOToLinalgConversionPattern(func.getContext(), &patterns);
-    if (failed(applyPartialConversion(func, target, patterns, nullptr))) {
+    if (failed(applyPartialConversion(func, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc
index 9d5b52df010..ecd1308be4b 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc
@@ -552,7 +552,7 @@ class LowerToNVVMPass
     // TODO(csigg): Remove once we support replacing non-root ops.
     target.addLegalOp<::mlir::gpu::GPUModuleOp, ::mlir::gpu::ModuleEndOp,
                       ::mlir::gpu::YieldOp>();
-    if (failed(mlir::applyFullConversion(m, target, patterns, &converter))) {
+    if (failed(mlir::applyFullConversion(m, target, patterns))) {
       signalPassFailure();
     }
   }
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 52c573628ac..27eca0ee54f 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -710,8 +710,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
     )
 
     # Check out LLVM and MLIR from llvm-project.
-    LLVM_COMMIT = "92d8ad02e92fed3884169ba5d98056fe4fa5660d"
-    LLVM_SHA256 = "a4995ace7ddaef0c49293dc65771f58ef1fea96ebe1f39aa0a2d6d75d07f6cc7"
+    LLVM_COMMIT = "7f0d7f32631648acf48bc23047635ab5e2058a1a"
+    LLVM_SHA256 = "2f1dbae231b3b8f9c67d6a4f578c8ce29f3aa2831313b34c40ff2edb4014476a"
     LLVM_URLS = [
         "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT),
         "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT),

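The LLVM bump above tracks an MLIR API change: applyPartialConversion and applyFullConversion no longer take an explicit TypeConverter (or nullptr) argument, so every pass in the diff simply drops that parameter. A minimal pass skeleton against the new signature might look as follows; ExampleLoweringPass and the commented-out pattern-population call are hypothetical placeholders, not code from this patch.

// Hypothetical pass skeleton using the updated conversion driver signature
// (no TypeConverter argument), assuming MLIR at roughly this LLVM revision.
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"

namespace {

struct ExampleLoweringPass
    : public mlir::PassWrapper<ExampleLoweringPass, mlir::FunctionPass> {
  void runOnFunction() override {
    mlir::ConversionTarget target(getContext());
    // Legality rules for the hypothetical source/target dialects go here.
    mlir::OwningRewritePatternList patterns;
    // populateExampleLoweringPatterns(&getContext(), &patterns);  // hypothetical
    if (mlir::failed(
            mlir::applyPartialConversion(getFunction(), target, patterns))) {
      signalPassFailure();
    }
  }
};

}  // namespace

Passes that still need a TypeConverter, such as HloLegalizeToLhlo above, now keep it only for pattern construction and legality checks instead of handing it to the conversion driver.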
From c662daf4891a1e6efe64797615c3bd2bebedc5f5 Mon Sep 17 00:00:00 2001
From: Smit Hinsu 
Date: Fri, 19 Jun 2020 02:22:43 -0700
Subject: [PATCH 105/112] Override CustomCall in MlirHloBuilder

Also, enable the MLIR bridge for the image ops compiler test. ResizeBilinear op
lowering uses CustomCall in the case of TPU lowerings.

PiperOrigin-RevId: 317272443
Change-Id: I134c828cdc76552a0cbfdeb7c65532aa986314e2
---
 .../compiler/mlir/xla/ir/mlir_hlo_builder.cc  | 16 ++++++++++++
 .../compiler/mlir/xla/ir/mlir_hlo_builder.h   |  6 +++++
 .../xla/transforms/legalize_tf_with_tf2xla.cc |  8 ++++++
 tensorflow/compiler/tests/BUILD               |  1 +
 tensorflow/compiler/xla/client/xla_builder.cc | 26 ++++++++++++++-----
 tensorflow/compiler/xla/client/xla_builder.h  |  8 ++++++
 6 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
index 21b1ac5f0ea..3c11d8e590d 100644
--- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
+++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
@@ -132,6 +132,22 @@ StatusOr MlirHloBuilder::FftInternal(
   return MakeXlaOp(op);
 }
 
+StatusOr<XlaOp> MlirHloBuilder::CustomCallInternal(
+    const string& call_target_name, absl::Span<const XlaOp> operands,
+    const Shape& shape, const string& opaque,
+    absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
+  if (operand_shapes_with_layout.has_value())
+    return Unimplemented(
+        "CustomCall doesn't support operands shapes with layout");
+  TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType<mlir::RankedTensorType>(
+                                         shape, builder_));
+  auto op = builder_.create<mlir::xla_hlo::CustomCallOp>(
+      loc_, ty, GetValues(operands), builder_.getStringAttr(call_target_name),
+      /*has_side_effect=*/builder_.getBoolAttr(false),
+      builder_.getStringAttr(opaque));
+  return MakeXlaOp(op);
+}
+
 StatusOr MlirHloBuilder::ReduceInternal(
     const Shape& shape, absl::Span all_operands,
     const XlaComputation& computation,
diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h
index 4b28c32db99..4d7d93af7a7 100644
--- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h
+++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h
@@ -124,6 +124,12 @@ class MlirHloBuilder : public XlaBuilder {
                               FftType fft_type,
                               absl::Span fft_length) override;
 
+  StatusOr<XlaOp> CustomCallInternal(const string& call_target_name,
+                                     absl::Span<const XlaOp> operands,
+                                     const Shape& shape, const string& opaque,
+                                     absl::optional<absl::Span<const Shape>>
+                                         operand_shapes_with_layout) override;
+
   StatusOr ReduceInternal(
       const Shape& shape, absl::Span all_operands,
       const XlaComputation& computation,
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc
index ef79c8868bb..8f96f4d1305 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc
@@ -88,6 +88,9 @@ static bool IsOpWhitelisted(Operation* op) {
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
@@ -127,6 +130,7 @@ static bool IsOpWhitelisted(Operation* op) {
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
@@ -157,10 +161,14 @@ static bool IsOpWhitelisted(Operation* op) {
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
+    TypeID::get(),
     TypeID::get(),
     TypeID::get(),
     TypeID::get(),
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b574622efce..034ec82de10 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -770,6 +770,7 @@ tf_xla_py_test(
     size = "small",
     timeout = "long",
     srcs = ["image_ops_test.py"],
+    enable_mlir_bridge = True,
     python_version = "PY3",
     shard_count = 10,
     tags = [
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index c7b6a7f9491..03ae23ea18b 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -1564,16 +1564,12 @@ XlaOp XlaBuilder::CustomCall(
     const Shape& shape, const string& opaque,
     absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    HloInstructionProto instr;
     if (absl::StartsWith(call_target_name, "$")) {
       return InvalidArgument(
           "Invalid custom_call_target \"%s\": Call targets that start with '$' "
           "are reserved for internal use.",
           call_target_name);
     }
-    *instr.mutable_shape() = shape.ToProto();
-    instr.set_custom_call_target(call_target_name);
-    instr.set_backend_config(opaque);
     if (operand_shapes_with_layout.has_value()) {
       if (!LayoutUtil::HasLayout(shape)) {
         return InvalidArgument(
@@ -1586,7 +1582,6 @@ XlaOp XlaBuilder::CustomCall(
             "with constrained layout; given %d shapes, expected %d",
             operand_shapes_with_layout->size(), operands.size());
       }
-      instr.set_constrain_layout(true);
       int64 operand_num = 0;
       for (const Shape& operand_shape : *operand_shapes_with_layout) {
         if (!LayoutUtil::HasLayout(operand_shape)) {
@@ -1595,14 +1590,31 @@ XlaOp XlaBuilder::CustomCall(
               "constrained layout.",
               operand_num);
         }
-        *instr.add_operand_shapes_with_layout() = operand_shape.ToProto();
         ++operand_num;
       }
     }
-    return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+    return CustomCallInternal(call_target_name, operands, shape, opaque,
+                              operand_shapes_with_layout);
   });
 }
 
+StatusOr<XlaOp> XlaBuilder::CustomCallInternal(
+    const string& call_target_name, absl::Span<const XlaOp> operands,
+    const Shape& shape, const string& opaque,
+    absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
+  HloInstructionProto instr;
+  *instr.mutable_shape() = shape.ToProto();
+  instr.set_custom_call_target(call_target_name);
+  instr.set_backend_config(opaque);
+  if (operand_shapes_with_layout.has_value()) {
+    instr.set_constrain_layout(true);
+    for (const Shape& operand_shape : *operand_shapes_with_layout) {
+      *instr.add_operand_shapes_with_layout() = operand_shape.ToProto();
+    }
+  }
+  return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+}
+
 XlaOp XlaBuilder::CustomCall(
     const string& call_target_name, absl::Span operands,
     const XlaComputation& computation, const Shape& shape, const string& opaque,
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index b8af180b83e..3fc26747468 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -527,6 +527,14 @@ class XlaBuilder {
       const Shape& shape_with_layout, const string& opaque,
       absl::optional> operand_shapes_with_layout);
 
+  // Internal version of CustomCall without computation that doesn't do
+  // op-specific error handling and expects arguments to be legal. The
+  // CustomCall method above calls this method after error handling.
+  virtual StatusOr<XlaOp> CustomCallInternal(
+      const string& call_target_name, absl::Span<const XlaOp> operands,
+      const Shape& shape_with_layout, const string& opaque,
+      absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
+
   XlaOp CustomCall(
       const string& call_target_name, absl::Span operands,
       const XlaComputation& computation, const Shape& shape_with_layout,

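The CustomCall change follows the split already used for ops like Fft and Reduce in this builder: the public XlaBuilder method keeps the argument validation, while a virtual *Internal hook performs the actual construction so that MlirHloBuilder can emit MLIR instead of an HloInstructionProto. A stripped-down sketch of that pattern, using hypothetical Builder/Op names rather than the XLA types, is:

// Hypothetical illustration of "validate in the public method, build in a
// virtual Internal hook"; not the XLA API.
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct Op {
  std::string debug_name;
};

class Builder {
 public:
  virtual ~Builder() = default;

  // Public entry point: argument validation lives here, once.
  Op BuildThing(const std::string& target, const std::vector<Op>& operands) {
    if (target.empty() || target[0] == '$') {
      throw std::invalid_argument("reserved or empty target name");
    }
    return BuildThingInternal(target, operands);
  }

 protected:
  // Construction hook: the base class emits one representation...
  virtual Op BuildThingInternal(const std::string& target,
                                const std::vector<Op>& operands) {
    return Op{"proto:" + target};
  }
};

// ...and a subclass overrides only the construction step, inheriting the
// validation unchanged.
class IrBuilder : public Builder {
 protected:
  Op BuildThingInternal(const std::string& target,
                        const std::vector<Op>& operands) override {
    return Op{"ir:" + target};
  }
};

int main() {
  IrBuilder b;
  std::cout << b.BuildThing("my_target", {}).debug_name << "\n";  // ir:my_target
  return 0;
}

Keeping validation in the non-virtual entry point means the MLIR and proto builders cannot diverge on error handling, which is why the reserved '$' target check stays in XlaBuilder::CustomCall above.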
From b58e6000457f26c7a53a9b945642fbe2baddbf20 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev 
Date: Fri, 19 Jun 2020 04:25:24 -0700
Subject: [PATCH 106/112] [XLA][MLIR] Enable xla_hlo.ReshapeOp ->
 xla_lhlo.ReshapeOp conversion.

PiperOrigin-RevId: 317284676
Change-Id: Ia845183efcfabe77f6eb66d8c56dcbfc82653982
---
 tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 1 +
 tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
index a11b08e0ea6..446f2aae833 100644
--- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc
@@ -464,6 +464,7 @@ void populateHLOToLHLOConversionPattern(
       HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
+      HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
       HloToLhloOpConverter,
diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
index 4b9397795a1..8d5f27474a5 100644
--- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
+++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h
@@ -61,6 +61,7 @@ MAP_HLO_TO_LHLO(MulOp);
 MAP_HLO_TO_LHLO(NegOp);
 MAP_HLO_TO_LHLO(RealOp);
 MAP_HLO_TO_LHLO(ReduceOp);
+MAP_HLO_TO_LHLO(ReshapeOp);
 MAP_HLO_TO_LHLO(RemOp);
 MAP_HLO_TO_LHLO(RsqrtOp);
 MAP_HLO_TO_LHLO(SelectOp);

From 0c7e61d6608e29324357fb5f79a9b925281521a0 Mon Sep 17 00:00:00 2001
From: Hanhan Wang 
Date: Fri, 19 Jun 2020 07:09:13 -0700
Subject: [PATCH 107/112] Remove the canonicalize pattern for folding a pad op
 into the following conv op.

This basically rolls back cl/305641881. The pattern can hurt performance
because the resulting operation can't be fully tiled in the Linalg
transformation. In this context, not everyone wants this pattern, so remove it
from the canonicalization patterns.

PiperOrigin-RevId: 317302072
Change-Id: I19aa64e14eecccfd738ad3f775f3670974bc68f9
---
 tensorflow/compiler/mlir/xla/ir/hlo_ops.cc    | 56 ----------------
 tensorflow/compiler/mlir/xla/ir/hlo_ops.td    |  2 -
 .../compiler/mlir/xla/tests/canonicalize.mlir | 65 -------------------
 .../mlir/xla/transforms/canonicalize.td       | 51 ---------------
 4 files changed, 174 deletions(-)

diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc
index d7950919883..e0fa1da93b8 100644
--- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc
+++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc
@@ -106,53 +106,6 @@ DenseIntElementsAttr BuildSliceLimits(DenseIntElementsAttr start_indices,
   return GetI64ElementsAttr(slice_limits, builder);
 }
 
-// Returns the padding value of the given position. If padding_attr is a
-// nullptr, returns 0.
-static int64_t GetPaddingValue(DenseIntElementsAttr padding_attr,
-                               ArrayRef index) {
-  if (!padding_attr) return 0;
-  return padding_attr.getValue(index);
-}
-
-static bool IsOnlyPaddingSpatialDims(Value lhs,
-                                     ConvDimensionNumbers dimension_numbers,
-                                     DenseIntElementsAttr edge_padding_low,
-                                     DenseIntElementsAttr edge_padding_high) {
-  const int64_t batch_dim = dimension_numbers.input_batch_dimension().getInt();
-  const int64_t feature_dim =
-      dimension_numbers.input_feature_dimension().getInt();
-  if (edge_padding_low.getValue(batch_dim) ||
-      edge_padding_high.getValue(batch_dim))
-    return false;
-  if (edge_padding_low.getValue(feature_dim) ||
-      edge_padding_high.getValue(feature_dim))
-    return false;
-  return true;
-}
-
-DenseIntElementsAttr BuildConvPaddingAttrs(
-    DenseIntElementsAttr edge_padding_low,
-    DenseIntElementsAttr edge_padding_high, DenseIntElementsAttr padding_attr,
-    ConvDimensionNumbers dimension_numbers, Builder* builder) {
-  SmallVector padding_low, padding_high;
-  for (const auto& dim : dimension_numbers.input_spatial_dimensions()) {
-    unsigned i = dim.getZExtValue();
-    padding_low.push_back(edge_padding_low.getValue(i));
-    padding_high.push_back(edge_padding_high.getValue(i));
-  }
-
-  int rank = padding_low.size();
-  SmallVector padding;
-  for (unsigned i = 0, e = rank; i < e; ++i) {
-    padding.push_back(GetPaddingValue(padding_attr, {i, 0}) + padding_low[i]);
-    padding.push_back(GetPaddingValue(padding_attr, {i, 1}) + padding_high[i]);
-  }
-  // padding_attr.getType() doesn't work because it is an optional attribute,
-  // which can be a nullptr.
-  auto type = RankedTensorType::get({rank, 2}, builder->getIntegerType(64));
-  return DenseIntElementsAttr::get(type, padding);
-}
-
 #include "tensorflow/compiler/mlir/xla/transforms/generated_canonicalize.inc"
 }  // namespace
 
@@ -2153,14 +2106,5 @@ LogicalResult deriveShapeFromFirstOperand(
   return success();
 }
 
-//===----------------------------------------------------------------------===//
-// ConvOp
-//===----------------------------------------------------------------------===//
-
-void ConvOp::getCanonicalizationPatterns(OwningRewritePatternList& results,
-                                         MLIRContext* context) {
-  results.insert(context);
-}
-
 }  // namespace xla_hlo
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td
index b1745c73fbf..f92d1c5b85c 100644
--- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td
+++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td
@@ -929,8 +929,6 @@ def HLO_ConvOp : HLO_Op<"convolution", [NoSideEffect]>, BASE_HLO_ConvOp {
   );
 
   let results = (outs HLO_Tensor);
-
-  let hasCanonicalizer = 1;
 }
 
 def HLO_CopyOp: HLO_Op<"copy", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_CopyOp {
diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir
index ef0f8c4d200..1954c3344df 100644
--- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir
@@ -415,71 +415,6 @@ func @fold_copy(%arg : tensor<1x4xf32>) -> tensor<1x4xf32> {
   return %0 : tensor<1x4xf32>
 }
 
-// CHECK-LABEL: func @fold_pad_into_conv_f32
-func @fold_pad_into_conv_f32(%arg0 : tensor<1x32x32x3xf32>,
-                         %arg1 : tensor<7x7x3x64xf32>)
-    -> tensor<1x16x16x64xf32> {
-  //  CHECK-NOT: xla_hlo.pad
-  //      CHECK: xla_hlo.convolution
-  // CHECK-SAME: padding = dense<3> : tensor<2x2xi64>
-  %0 = xla_hlo.constant dense<0.000000e+00> : tensor
-  %1 = "xla_hlo.pad"(%arg0, %0) {
-    edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    interior_padding = dense<0> : tensor<4xi64>
-  } : (tensor<1x32x32x3xf32>, tensor) -> tensor<1x38x38x3xf32>
-  %2 = "xla_hlo.convolution"(%1, %arg1) {
-    batch_group_count = 1 : i64,
-    dimension_numbers = {
-      input_batch_dimension = 0 : i64,
-      input_feature_dimension = 3 : i64,
-      input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-      kernel_input_feature_dimension = 2 : i64,
-      kernel_output_feature_dimension = 3 : i64,
-      kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-      output_batch_dimension = 0 : i64,
-      output_feature_dimension = 3 : i64,
-      output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
-    },
-    feature_group_count = 1 : i64,
-    padding = dense<0> : tensor<2x2xi64>,
-    window_strides = dense<2> : tensor<2xi64>
-  } : (tensor<1x38x38x3xf32>, tensor<7x7x3x64xf32>) -> tensor<1x16x16x64xf32>
-  return %2 : tensor<1x16x16x64xf32>
-}
-
-// CHECK-LABEL: func @fold_pad_into_conv_i32
-func @fold_pad_into_conv_i32(%arg0 : tensor<1x32x32x3xi32>,
-                         %arg1 : tensor<7x7x3x64xi32>)
-    -> tensor<1x16x16x64xi32> {
-  //  CHECK-NOT: xla_hlo.pad
-  //      CHECK: xla_hlo.convolution
-  // CHECK-SAME: padding = dense<3> : tensor<2x2xi64>
-  %0 = xla_hlo.constant dense<0> : tensor
-  %1 = "xla_hlo.pad"(%arg0, %0) {
-    edge_padding_high = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    edge_padding_low = dense<[0, 3, 3, 0]> : tensor<4xi64>,
-    interior_padding = dense<0> : tensor<4xi64>
-  } : (tensor<1x32x32x3xi32>, tensor) -> tensor<1x38x38x3xi32>
-  %2 = "xla_hlo.convolution"(%1, %arg1) {
-    batch_group_count = 1 : i64,
-    dimension_numbers = {
-      input_batch_dimension = 0 : i64,
-      input_feature_dimension = 3 : i64,
-      input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>,
-      kernel_input_feature_dimension = 2 : i64,
-      kernel_output_feature_dimension = 3 : i64,
-      kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>,
-      output_batch_dimension = 0 : i64,
-      output_feature_dimension = 3 : i64,
-      output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>
-    },
-    feature_group_count = 1 : i64,
-    window_strides = dense<2> : tensor<2xi64>
-  } : (tensor<1x38x38x3xi32>, tensor<7x7x3x64xi32>) -> tensor<1x16x16x64xi32>
-  return %2 : tensor<1x16x16x64xi32>
-}
-
 // CHECK-LABEL: func @dynamic_reshape_not_actually_dynamic
 func @dynamic_reshape_not_actually_dynamic(%arg0: tensor<4xf32>, %shape: tensor<2xindex>) -> tensor<4x1xf32> {
   // CHECK: xla_hlo.reshape
diff --git a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td
index b788cb80380..c319551d92a 100644
--- a/tensorflow/compiler/mlir/xla/transforms/canonicalize.td
+++ b/tensorflow/compiler/mlir/xla/transforms/canonicalize.td
@@ -28,54 +28,3 @@ def UnaryEinsumToEinsum : Pat<
   (HLO_UnaryEinsumOp $operand, $equation),
   (HLO_EinsumOp (HLO_ConstOp (GetScalarOfType<1> $operand)),
                 $operand, (UnaryToBinaryEinsumEq $equation))>;
-
-//===----------------------------------------------------------------------===//
-// Conv op patterns.
-//===----------------------------------------------------------------------===//
-
-def IsZero : Attr() &&"
-  "$_self.cast().isSplat() &&"
-  "$_self.cast().getSplatValue()"
-  ".getValue().isZero()) ||"
-  "($_self.isa() &&"
-  "$_self.cast().isSplat() &&"
-  "$_self.cast().getSplatValue()"
-  ".getInt() == 0)">>;
-
-def IsOnlyPaddingSpatialDims
-  : Constraint>;
-
-def BuildConvPaddingAttrs : NativeCodeCall<
-  "BuildConvPaddingAttrs($0, $1, $2, $3, &$_builder)">;
-
-def FoldPadIntoConv : Pat<
-  (HLO_ConvOp
-    (HLO_PadOp $lhs,
-      (HLO_ConstOp IsZero:$padding_value),
-      $edge_padding_low,
-      $edge_padding_high,
-      IsZero:$interior_padding),
-    $rhs,
-    $window_strides,
-    $padding,
-    $lhs_dilation,
-    $rhs_dilation,
-    $dimension_numbers,
-    $feature_group_count,
-    $batch_group_count,
-    $precision_config),
-  (HLO_ConvOp
-    $lhs,
-    $rhs,
-    $window_strides,
-    (BuildConvPaddingAttrs $edge_padding_low, $edge_padding_high, $padding,
-      $dimension_numbers),
-    $lhs_dilation,
-    $rhs_dilation,
-    $dimension_numbers,
-    $feature_group_count,
-    $batch_group_count,
-    $precision_config),
-    [(IsOnlyPaddingSpatialDims $lhs, $dimension_numbers, $edge_padding_low,
-      $edge_padding_high)]>;

From 9e7d5ef6f25e436fffae03597838294d872404f0 Mon Sep 17 00:00:00 2001
From: "T.J. Alumbaugh" 
Date: Fri, 19 Jun 2020 07:29:02 -0700
Subject: [PATCH 108/112] Full int8 quantization BatchMatMul

PiperOrigin-RevId: 317304259
Change-Id: Icf96d9d129db30b965e36f5c8befd27762b173b2
---
 tensorflow/compiler/mlir/lite/ir/tfl_ops.td   |   6 +-
 tensorflow/lite/kernels/batch_matmul.cc       |  92 ++++++++++-
 tensorflow/lite/kernels/batch_matmul_test.cc  | 156 ++++++++++++++++--
 .../kernels/internal/optimized/batch_matmul.h | 106 ++++++++++++
 .../kernels/internal/reference/batch_matmul.h |  93 +++++++++++
 tensorflow/lite/kernels/register.cc           |   4 +-
 .../lite/tools/optimize/operator_property.cc  |   6 +
 .../lite/tools/versioning/op_version.cc       |   2 +
 .../lite/tools/versioning/runtime_version.cc  |   1 +
 9 files changed, 439 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index 509c13ae161..33281cc58fb 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -953,14 +953,14 @@ in the batch dimensions and broadcasting.
   }];
 
   let arguments = (ins
-    TFL_TensorOf<[F32]>:$x,
-    TFL_TensorOf<[F32]>:$y,
+    TFL_TensorOf<[F32, QI8]>:$x,
+    TFL_TensorOf<[F32, QI8]>:$y,
     DefaultValuedAttr:$adj_x,
     DefaultValuedAttr:$adj_y
   );
 
    let results = (outs
-    TFL_TensorOf<[F32]>:$output
+    TFL_TensorOf<[F32, QI8]>:$output
   );
 
   let hasOptions = 1;
diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc
index 8bc23c9c94a..a414a226504 100644
--- a/tensorflow/lite/kernels/batch_matmul.cc
+++ b/tensorflow/lite/kernels/batch_matmul.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include 
 #include 
+#include 
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/common.h"
@@ -52,6 +53,14 @@ enum KernelType {
 };
 
 struct OpData {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
   // The index of the temporary tensors where we store transposed LHS/RHS.
   int scratch_tensor_index;
   bool rhs_transposed;
@@ -274,6 +283,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   OpContext op_context(context, node);
   TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   bool adj_x = op_context.params->adj_x;
   bool adj_y = op_context.params->adj_y;
@@ -282,7 +292,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* rhs_data = GetInput(context, node, kInputRHSTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  TF_LITE_ENSURE_TYPES_EQ(context, lhs_data->type, kTfLiteFloat32);
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (lhs_data->type == kTfLiteInt8) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, lhs_data, rhs_data, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &op_data->output_multiplier, &exponent);
+    op_data->output_shift = exponent;
+    // BatchMatMul has no fused activation functions. Therefore, set the
+    // output activation min and max to the min and max of the int8_t type,
+    // respectively.
+    op_data->output_activation_min = std::numeric_limits<int8_t>::min();
+    op_data->output_activation_max = std::numeric_limits<int8_t>::max();
+  }
+
+  TF_LITE_ENSURE(context, lhs_data->type == kTfLiteFloat32 ||
+                              lhs_data->type == kTfLiteInt8);
   TF_LITE_ENSURE(context, rhs_data->type == kTfLiteFloat32 ||
                               rhs_data->type == kTfLiteInt8);
   // Support dimensions between 2 and 4, inclusive.
@@ -433,6 +460,41 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node, OpData* data,
   return kTfLiteOk;
 }
 
+template <KernelType kernel_type>
+TfLiteStatus EvalInt8(TfLiteContext* context, const OpData* data,
+                      const RuntimeShape& lhs_shape, const TfLiteTensor* lhs,
+                      const RuntimeShape& rhs_shape, const TfLiteTensor* rhs,
+                      const RuntimeShape& output_shape, TfLiteTensor* output) {
+  // Reuse params struct from FullyConnected Op.
+  FullyConnectedParams op_params;
+  int32_t input_offset = -lhs->params.zero_point;
+  int32_t filter_offset = -rhs->params.zero_point;
+  int32_t output_offset = output->params.zero_point;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.lhs_cacheable = IsConstantTensor(lhs);
+  op_params.rhs_cacheable = IsConstantTensor(rhs);
+
+  if (kernel_type == kReference) {
+    reference_ops::BatchMatMul(op_params, rhs_shape, GetTensorData<int8_t>(rhs),
+                               lhs_shape, GetTensorData<int8_t>(lhs),
+                               GetTensorShape(output),
+                               GetTensorData<int8_t>(output));
+  } else {
+    optimized_ops::BatchMatMul(op_params, rhs_shape, GetTensorData<int8_t>(rhs),
+                               lhs_shape, GetTensorData<int8_t>(lhs),
+                               GetTensorShape(output),
+                               GetTensorData<int8_t>(output),
+                               CpuBackendContext::GetFromContext(context));
+  }
+  return kTfLiteOk;
+}
+
 template <KernelType kernel_type>
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            OpData* data, const RuntimeShape& lhs_shape,
@@ -448,25 +510,39 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     return EvalHybrid(
         context, node, data, lhs_shape, lhs, rhs_shape, rhs, input_quantized,
         scaling_factors, accum_scratch, row_sums, input_offsets, output);
+  } else if (lhs->type == kTfLiteInt8) {
+    return EvalInt8<kernel_type>(context, data, lhs_shape, lhs, rhs_shape, rhs,
+                                 GetTensorShape(output), output);
   } else {
-    TF_LITE_KERNEL_LOG(context,
-                       "Currently only hybrid quantization is supported.\n");
+    TF_LITE_KERNEL_LOG(
+        context, "Currently only hybrid and int8 quantization is supported.\n");
     return kTfLiteError;
   }
   return kTfLiteOk;
 }
 
-TfLiteTensor* GetRhs(TfLiteContext* context, TfLiteNode* node,
-                     const TfLiteTensor* rhs) {
+TfLiteTensor* GetTempRhs(TfLiteContext* context, TfLiteNode* node,
+                         const TfLiteTensor* rhs) {
   TfLiteTensor* transposed_rhs = GetTemporary(context, node, 1);
   if (rhs->type == kTfLiteInt8) {
-    // Get the quantization params from the weights tensors.
+    // Get the quantization params from the RHS tensor.
     transposed_rhs->params.scale = rhs->params.scale;
     transposed_rhs->params.zero_point = rhs->params.zero_point;
   }
   return transposed_rhs;
 }
 
+TfLiteTensor* GetTempLhs(TfLiteContext* context, TfLiteNode* node,
+                         const TfLiteTensor* lhs) {
+  TfLiteTensor* transposed_lhs = GetTemporary(context, node, 0);
+  if (lhs->type == kTfLiteInt8) {
+    // Get the quantization params from the LHS tensor.
+    transposed_lhs->params.scale = lhs->params.scale;
+    transposed_lhs->params.zero_point = lhs->params.zero_point;
+  }
+  return transposed_lhs;
+}
+
 // Perform a batch matrix multiply on
 // LHS <..., A, B>  X  RHS<..., B, C>
 // where the leading dimensions of LHS and RHS obey broadcasting rules
@@ -491,8 +567,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   bool adj_y = op_context.params->adj_y;
   bool adj_x = op_context.params->adj_x;
 
-  const TfLiteTensor* rhs_tensor = adj_y ? rhs : GetRhs(context, node, rhs);
-  const TfLiteTensor* lhs_tensor = adj_x ? GetTemporary(context, node, 0) : lhs;
+  const TfLiteTensor* rhs_tensor = adj_y ? rhs : GetTempRhs(context, node, rhs);
+  const TfLiteTensor* lhs_tensor = adj_x ? GetTempLhs(context, node, lhs) : lhs;
   if (!adj_y) {
     // TODO(b/154760341) Constant tensors should already be transposed, but
     // we transpose once if necessary for now.
diff --git a/tensorflow/lite/kernels/batch_matmul_test.cc b/tensorflow/lite/kernels/batch_matmul_test.cc
index 5e52479f49b..98df8ebe3db 100644
--- a/tensorflow/lite/kernels/batch_matmul_test.cc
+++ b/tensorflow/lite/kernels/batch_matmul_test.cc
@@ -24,8 +24,19 @@ limitations under the License.
 #include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_BATCH_MATMUL_REF();
+TfLiteRegistration* Register_BATCH_MATMUL_GENERIC_OPTIMIZED();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
+using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
 
 template 
@@ -53,7 +64,20 @@ class BatchMatMulOpModel : public SingleOpModel {
   int output_id_;
 };
 
-TEST(BatchMatMulOpModelTest, Float32Test_Simple) {
+const auto kKernelMap = new std::map({
+    {"Reference", ops::builtin::Register_BATCH_MATMUL_REF()},
+    {"GenericOptimized",
+     ops::builtin::Register_BATCH_MATMUL_GENERIC_OPTIMIZED()},
+});
+
+class BatchMatMulOpTest : public SingleOpTest {
+ protected:
+  const std::map& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(BatchMatMulOpTest, Float32Test_Simple) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {1, 2, 3}},
                                   {TensorType_FLOAT32, {1, 3, 4}});
   model.PopulateTensor(model.lhs(), {1, 2, 3, 4, 5, 6});
@@ -65,7 +89,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Simple) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_SimpleRHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_SimpleRHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {1, 2, 3}},
                                   {TensorType_FLOAT32, {1, 4, 3}}, false, true);
   model.PopulateTensor(model.lhs(), {1, 2, 3, 4, 5, 6});
@@ -77,7 +101,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_SimpleRHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_SimpleLHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_SimpleLHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {1, 3, 2}},
                                   {TensorType_FLOAT32, {1, 3, 4}}, true, false);
   model.PopulateTensor(model.lhs(), {1, 4, 2, 5, 3, 6});
@@ -89,7 +113,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_SimpleLHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_BatchSizeTwo) {
+TEST_P(BatchMatMulOpTest, Float32Test_BatchSizeTwo) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 2, 3}},
                                   {TensorType_FLOAT32, {2, 3, 4}});
   model.PopulateTensor(model.lhs(),
@@ -105,7 +129,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_BatchSizeTwo) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 2, 3}},
                                   {TensorType_FLOAT32, {3, 4}});
   model.PopulateTensor(model.lhs(),
@@ -121,7 +145,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_BroadcastLHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_BroadcastLHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 3, 2}},
                                   {TensorType_FLOAT32, {3, 4}}, true, false);
   model.PopulateTensor(model.lhs(),
@@ -137,7 +161,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_BroadcastLHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 3, 2}},
                                   {TensorType_FLOAT32, {3, 2, 4}});
   model.PopulateTensor(model.lhs(),
@@ -161,7 +185,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2LHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2LHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 2, 3}},
                                   {TensorType_FLOAT32, {3, 2, 4}}, true, false);
   model.PopulateTensor(model.lhs(),
@@ -185,7 +209,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2LHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2RHSAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2RHSAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 3, 2}},
                                   {TensorType_FLOAT32, {3, 4, 2}}, false, true);
   model.PopulateTensor(model.lhs(),
@@ -208,7 +232,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2RHSAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2BothAdjoint) {
+TEST_P(BatchMatMulOpTest, Float32Test_Broadcast2BothAdjoint) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {2, 1, 2, 3}},
                                   {TensorType_FLOAT32, {3, 4, 2}}, true, true);
   model.PopulateTensor(model.lhs(),
@@ -231,7 +255,7 @@ TEST(BatchMatMulOpModelTest, Float32Test_Broadcast2BothAdjoint) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 3, 3, 4}));
 }
 
-TEST(BatchMatMulOpModelTest, Float32Test_BroadcastFromRHS) {
+TEST_P(BatchMatMulOpTest, Float32Test_BroadcastFromRHS) {
   BatchMatMulOpModel model({TensorType_FLOAT32, {4, 5}},
                                   {TensorType_FLOAT32, {3, 1, 5, 2}});
   model.PopulateTensor(
@@ -251,6 +275,10 @@ TEST(BatchMatMulOpModelTest, Float32Test_BroadcastFromRHS) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 1, 4, 2}));
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    BatchMatMulOpTest, BatchMatMulOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 // In the hybrid model the weights are quantized int8. But the input
 // and output are expected to be in float precision.
 class HybridAsymmetricBatchMatMulOpModel : public SingleOpModel {
@@ -304,7 +332,14 @@ class HybridAsymmetricBatchMatMulOpModel : public SingleOpModel {
   int input_size_;
 };
 
-TEST(HybridAsymmetricBatchMatMulOpTest, SimpleTestQuantizedInt8) {
+class HybridAsymmetricBatchMatMulOpTest : public SingleOpTest {
+ protected:
+  const std::map& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(HybridAsymmetricBatchMatMulOpTest, SimpleTestQuantizedInt8) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/3, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 10}},
@@ -335,7 +370,7 @@ TEST(HybridAsymmetricBatchMatMulOpTest, SimpleTestQuantizedInt8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
 }
 
-TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastWeights) {
+TEST_P(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastWeights) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/3, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 2, 10}},
@@ -366,7 +401,7 @@ TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastWeights) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 3}));
 }
 
-TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastBigWeights) {
+TEST_P(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastBigWeights) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/9, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 2, 10}},
@@ -401,7 +436,7 @@ TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastBigWeights) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 9}));
 }
 
-TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastInputs) {
+TEST_P(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastInputs) {
   HybridAsymmetricBatchMatMulOpModel m(
       /*units=*/3, /*batches=*/2,
       /*lhs=*/{TensorType_FLOAT32, {2, 10}},
@@ -431,5 +466,96 @@ TEST(HybridAsymmetricBatchMatMulOpTest, QuantizedInt8BroadcastInputs) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 3}));
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    HybridAsymmetricBatchMatMulOpTest, HybridAsymmetricBatchMatMulOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
+class QuantizedBatchMatMulOpModel : public SingleOpModel {
+ public:
+  QuantizedBatchMatMulOpModel(int units, int batches, const TensorData& lhs,
+                              const TensorData& output = {TensorType_INT8},
+                              bool adj_x = false, bool adj_y = false)
+      : units_(units), batches_(batches) {
+    int total_input_size = 1;
+    for (size_t i = 0; i < lhs.shape.size(); ++i) {
+      total_input_size *= lhs.shape[i];
+    }
+    input_size_ = total_input_size / batches_;
+
+    lhs_id_ = AddInput(lhs);
+    rhs_id_ = AddInput({lhs.type, {input_size_, units_}, lhs.min, lhs.max});
+
+    output_id_ = AddOutput(output);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_MATMUL,
+                 BuiltinOptions_BatchMatMulOptions,
+                 CreateBatchMatMulOptions(builder_, adj_x, adj_y).Union());
+    BuildInterpreter({GetShape(lhs_id_), GetShape(rhs_id_)});
+  }
+
+  template <typename T>
+  void SetWeights(const std::vector<float>& data) {
+    QuantizeAndPopulate<T>(rhs_id_, data);
+  }
+
+  template <typename T>
+  void SetInput(const std::vector<float>& data) {
+    QuantizeAndPopulate<T>(lhs_id_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_id_);
+  }
+
+  template <typename T>
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<T>(ExtractVector<T>(output_id_), GetScale(output_id_),
+                         GetZeroPoint(output_id_));
+  }
+
+ protected:
+  int lhs_id_;
+  int rhs_id_;
+  int output_id_;
+  int units_;
+  int batches_;
+  int input_size_;
+};
+
+class QuantizedBatchMatMulOpTest : public SingleOpTest {
+ protected:
+  const std::map& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(QuantizedBatchMatMulOpTest, SimpleTestQuantizedInt8) {
+  QuantizedBatchMatMulOpModel m(
+      /*units=*/3, /*batches*/ 2,
+      /*lhs=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_INT8, {}, -127, 128});
+
+  m.SetWeights({
+      1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5,  5,  5,
+      6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10,
+  });
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({23, 23, 23, 57, 57, 57})));
+  EXPECT_THAT(m.GetOutput(), ElementsAre(22, 22, 22, 56, 56, 56));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    QuantizedBatchMatMulOpTest, QuantizedBatchMatMulOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/optimized/batch_matmul.h b/tensorflow/lite/kernels/internal/optimized/batch_matmul.h
index 24b5012304f..5e622154d60 100644
--- a/tensorflow/lite/kernels/internal/optimized/batch_matmul.h
+++ b/tensorflow/lite/kernels/internal/optimized/batch_matmul.h
@@ -272,6 +272,112 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
   }
 }
 
+inline void BatchMatMul(const FullyConnectedParams& params,
+                        const RuntimeShape& lhs_shape, const int8_t* lhs_data,
+                        const RuntimeShape& rhs_shape, const int8_t* rhs_data,
+                        const RuntimeShape& output_shape, int8_t* output_data,
+                        CpuBackendContext* context) {
+  using ::tflite::cpu_backend_gemm::Gemm;
+  using ::tflite::cpu_backend_gemm::GemmParams;
+  using ::tflite::cpu_backend_gemm::MatrixParams;
+
+  const RuntimeShape extended_lhs_shape =
+      RuntimeShape::ExtendedShape(5, lhs_shape);
+  const RuntimeShape extended_rhs_shape =
+      RuntimeShape::ExtendedShape(5, rhs_shape);
+
+  // Determine which dimension is the broadcast dimension.
+  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
+    if (lhs_dim == rhs_dim) return lhs_dim;
+    if (lhs_dim == 1) return rhs_dim;
+    TFLITE_DCHECK_EQ(rhs_dim, 1);
+    return lhs_dim;
+  };
+
+  // Compute the "extent" for iterating on this dimension.
+  // If we are broadcasting, then don't advance (i.e return 0).
+  auto extent = [](const RuntimeShape& shape, int x) {
+    if (shape.Dims(x) == 1) {
+      return 0;
+    }
+    int prod = 1;
+    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
+      prod *= shape.Dims(i);
+    }
+    return prod;
+  };
+
+  const int batch_dim0 =
+      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 =
+      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 =
+      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+
+  const int lhs_ext0 = extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+
+  // Set params for each matrix multiply.
+  const int lhs_rows = extended_lhs_shape.Dims(3);
+  const int rhs_cols = extended_rhs_shape.Dims(4);
+  const int accum_depth = extended_lhs_shape.Dims(4);
+
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  MatrixParams<int8_t> lhs_params;
+  lhs_params.order = cpu_backend_gemm::Order::kRowMajor;
+  lhs_params.rows = lhs_rows;
+  lhs_params.cols = accum_depth;
+  lhs_params.zero_point = -filter_offset;
+
+  MatrixParams<int8_t> rhs_params;
+  rhs_params.order = cpu_backend_gemm::Order::kColMajor;
+  rhs_params.rows = accum_depth;
+  rhs_params.cols = rhs_cols;
+  rhs_params.zero_point = -input_offset;
+
+  MatrixParams<int8_t> dst_params;
+  dst_params.order = cpu_backend_gemm::Order::kColMajor;
+  dst_params.rows = lhs_rows;
+  dst_params.cols = rhs_cols;
+  dst_params.zero_point = output_offset;
+
+  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+    const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
+    const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
+    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+      const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
+      const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
+      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+        const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
+        const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
+        int8_t* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
+                                         b1 * batch_dim2 + b2) *
+                                            lhs_rows * rhs_cols;
+
+        GemmParams<int32, int8_t> gemm_params;
+        gemm_params.clamp_min = output_activation_min;
+        gemm_params.clamp_max = output_activation_max;
+        gemm_params.multiplier_fixedpoint = output_multiplier;
+        gemm_params.multiplier_exponent = output_shift;
+        cpu_backend_gemm::Gemm(lhs_params, lhs_ptr2, rhs_params, rhs_ptr2,
+                               dst_params, out_ptr, gemm_params, context);
+      }
+    }
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h
index 1394bd9da64..05caefaca5d 100644
--- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h
+++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h
@@ -217,6 +217,99 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data,
   }
 }
 
+inline void BatchMatMul(const FullyConnectedParams& params,
+                        const RuntimeShape& lhs_shape, const int8_t* lhs_data,
+                        const RuntimeShape& rhs_shape, const int8_t* rhs_data,
+                        const RuntimeShape& output_shape, int8_t* output_data) {
+  const RuntimeShape extended_lhs_shape =
+      RuntimeShape::ExtendedShape(5, lhs_shape);
+  const RuntimeShape extended_rhs_shape =
+      RuntimeShape::ExtendedShape(5, rhs_shape);
+
+  // Determine which dimension is the broadcast dimension.
+  auto broadcast_dim = [](int lhs_dim, int rhs_dim) {
+    if (lhs_dim == rhs_dim) return lhs_dim;
+    if (lhs_dim == 1) return rhs_dim;
+    TFLITE_DCHECK_EQ(rhs_dim, 1);
+    return lhs_dim;
+  };
+
+  // Compute the "extent" for iterating on this dimension.
+  // If we are broadcasting, then don't advance (i.e return 0).
+  auto extent = [](const RuntimeShape& shape, int x) {
+    if (shape.Dims(x) == 1) {
+      return 0;
+    }
+    int prod = 1;
+    for (int i = x + 1; i < shape.DimensionsCount(); ++i) {
+      prod *= shape.Dims(i);
+    }
+    return prod;
+  };
+
+  const int batch_dim0 =
+      broadcast_dim(extended_lhs_shape.Dims(0), extended_rhs_shape.Dims(0));
+  const int batch_dim1 =
+      broadcast_dim(extended_lhs_shape.Dims(1), extended_rhs_shape.Dims(1));
+  const int batch_dim2 =
+      broadcast_dim(extended_lhs_shape.Dims(2), extended_rhs_shape.Dims(2));
+
+  const int lhs_ext0 = extent(extended_lhs_shape, 0);
+  const int lhs_ext1 = extent(extended_lhs_shape, 1);
+  const int lhs_ext2 = extent(extended_lhs_shape, 2);
+  const int rhs_ext0 = extent(extended_rhs_shape, 0);
+  const int rhs_ext1 = extent(extended_rhs_shape, 1);
+  const int rhs_ext2 = extent(extended_rhs_shape, 2);
+
+  // Set params for each matrix multiply.
+  const int lhs_rows = extended_lhs_shape.Dims(3);
+  const int rhs_cols = extended_rhs_shape.Dims(4);
+  const int accum_depth = extended_lhs_shape.Dims(4);
+
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  for (int b0 = 0; b0 < batch_dim0; ++b0) {
+    const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0);
+    const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0);
+    for (int b1 = 0; b1 < batch_dim1; ++b1) {
+      const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1;
+      const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1;
+      for (int b2 = 0; b2 < batch_dim2; ++b2) {
+        const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
+        const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
+        int8_t* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) +
+                                         b1 * batch_dim2 + b2) *
+                                            lhs_rows * rhs_cols;
+
+        for (int j = 0; j < rhs_cols; ++j) {
+          for (int i = 0; i < lhs_rows; ++i) {
+            int32_t total = 0;
+            for (int k = 0; k < accum_depth; ++k) {
+              int32 lhs_val = lhs_ptr2[accum_depth * i + k];
+              int32 rhs_val = rhs_ptr2[accum_depth * j + k];
+              total += (lhs_val + filter_offset) * (rhs_val + input_offset);
+            }
+            total = MultiplyByQuantizedMultiplier(total, output_multiplier,
+                                                  output_shift);
+            total += output_offset;
+            total = std::max(total, output_activation_min);
+            total = std::min(total, output_activation_max);
+            const int idx = lhs_rows * j + i;
+            out_ptr[idx] = static_cast<int8_t>(total);
+          }
+        }
+      }
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 90688a2aa1f..c3a4aaad16d 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -289,7 +289,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SCATTER_ND, Register_SCATTER_ND());
   AddBuiltin(BuiltinOperator_DENSIFY, Register_DENSIFY());
   AddBuiltin(BuiltinOperator_SEGMENT_SUM, Register_SEGMENT_SUM());
-  AddBuiltin(BuiltinOperator_BATCH_MATMUL, Register_BATCH_MATMUL());
+  AddBuiltin(BuiltinOperator_BATCH_MATMUL, Register_BATCH_MATMUL(),
+             /* min_version = */ 1,
+             /* max_version = */ 2);
   AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY());
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc
index 8a0cbca29e2..f2cb98ef31a 100644
--- a/tensorflow/lite/tools/optimize/operator_property.cc
+++ b/tensorflow/lite/tools/optimize/operator_property.cc
@@ -88,6 +88,12 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index,
       property.restrict_same_input_output_scale = true;
       property.version = 2;
       break;
+    case BuiltinOperator_BATCH_MATMUL: {
+      property.inputs = {{0, {}}, {1, {}}};
+      property.outputs = {{0, {}}};
+      property.version = 2;
+      break;
+    }
     case BuiltinOperator_BATCH_TO_SPACE_ND:
     case BuiltinOperator_SPACE_TO_BATCH_ND:
     case BuiltinOperator_SPACE_TO_DEPTH:
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index 118e2d420f8..a97b9da47f1 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace {
@@ -518,6 +519,7 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
     case BuiltinOperator_LESS:
     case BuiltinOperator_LESS_EQUAL:
     case BuiltinOperator_SELECT:
+    case BuiltinOperator_BATCH_MATMUL:
       if (op_sig.input_types.at(0) == TensorType_INT8) {
         return 2;
       }
diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc
index 92a7001606f..36976354685 100644
--- a/tensorflow/lite/tools/versioning/runtime_version.cc
+++ b/tensorflow/lite/tools/versioning/runtime_version.cc
@@ -58,6 +58,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code,
               {{BuiltinOperator_AVERAGE_POOL_2D, 2}, "1.14.0"},
               {{BuiltinOperator_AVERAGE_POOL_2D, 3}, kPendingReleaseVersion},
               {{BuiltinOperator_BATCH_MATMUL, 1}, kPendingReleaseVersion},
+              {{BuiltinOperator_BATCH_MATMUL, 2}, kPendingReleaseVersion},
               {{BuiltinOperator_CONV_2D, 1}, "1.5.0"},
               {{BuiltinOperator_CONV_2D, 2}, "1.14.0"},
               {{BuiltinOperator_CONV_2D, 3}, "1.14.0"},

From 07c54454eec55c1279c243a3c148eeee81b41ed5 Mon Sep 17 00:00:00 2001
From: Tamara Norman 
Date: Fri, 19 Jun 2020 08:48:51 -0700
Subject: [PATCH 109/112] Add an option such that the cached host_value can be
 discarded

PiperOrigin-RevId: 317315157
Change-Id: I9d7145390a526003069321c7e04794e139a53c09
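
A hedged usage sketch of the new parameter follows. It assumes an already-constructed xla::PjRtBuffer (obtaining one requires a PjRtClient and device buffers, which are not shown) and only contrasts the default call with the cache-discarding call added in this patch.

    #include <memory>

    #include "tensorflow/compiler/xla/literal.h"
    #include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
    #include "tensorflow/compiler/xla/statusor.h"

    // Default behaviour: the buffer keeps a cached host copy after this call,
    // so repeated reads are cheap.
    xla::StatusOr<std::shared_ptr<xla::Literal>> ReadKeepingCache(
        xla::PjRtBuffer* buffer) {
      return buffer->ToLiteral();
    }

    // New behaviour: the cached host value is dropped once the literal has
    // been handed back, freeing host memory for large, read-once buffers.
    xla::StatusOr<std::shared_ptr<xla::Literal>> ReadAndDropCache(
        xla::PjRtBuffer* buffer) {
      return buffer->ToLiteral(/*discard_cached_copy=*/true);
    }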
---
 tensorflow/compiler/xla/pjrt/pjrt_client.cc | 6 +++++-
 tensorflow/compiler/xla/pjrt/pjrt_client.h  | 8 ++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc
index 46f592100c9..b4f0363e69a 100644
--- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc
+++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc
@@ -1077,13 +1077,17 @@ Status PjRtBuffer::CopyToHostAsync() {
   return Status::OK();
 }
 
-StatusOr> PjRtBuffer::ToLiteral() {
+StatusOr> PjRtBuffer::ToLiteral(
+    const bool discard_cached_copy) {
   tensorflow::profiler::TraceMe traceme("PjRtBuffer::ToLiteral");
   TF_RETURN_IF_ERROR(CopyToHostAsync());
   std::shared_ptr host_value;
   {
     absl::MutexLock lock(&mu_);
     host_value = host_value_;
+    if (discard_cached_copy) {
+      host_value_ = nullptr;
+    }
   }
   if (host_value == nullptr) {
     return InvalidArgument("ToLiteral called on invalid buffer");
diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h
index 754eb19bec6..8f74e6244d6 100644
--- a/tensorflow/compiler/xla/pjrt/pjrt_client.h
+++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h
@@ -478,8 +478,12 @@ class PjRtBuffer {
 
   // Returns the buffer's value as an XLA Literal. If the value has previously
   // been prefetched to the host, then returns the prefetched version, otherwise
-  // copies the buffer to the host. Blocks until the value is ready.
-  StatusOr> ToLiteral();
+  // copies the buffer to the host. Blocks until the value is ready. If
+  // `discard_cached_copy` is true, the buffer will no longer keep a cached
+  // copy of the literal (i.e. the reference to the host value will be
+  // removed).
+  StatusOr> ToLiteral(
+      bool discard_cached_copy = false);
 
   // Initiates a copy of the buffer to the host. Does not block waiting for
   // the transfer to complete. The value can be retrieved by a later call to

From 16cb89bd7b40fc816aa7440f62b443ca480bcf6d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" 
Date: Fri, 19 Jun 2020 09:15:04 -0700
Subject: [PATCH 110/112] Qualify uses of std::string

PiperOrigin-RevId: 317319501
Change-Id: Ib75a31ad89fa1a6bda81450f2ab5ba07d7338ada
---
 tensorflow/lite/toco/tflite/export.cc         | 58 +++++++++----------
 tensorflow/lite/toco/tflite/export.h          | 12 ++--
 tensorflow/lite/toco/tflite/export_test.cc    | 56 ++++++++++--------
 tensorflow/lite/toco/tflite/import.cc         | 18 +++---
 tensorflow/lite/toco/tflite/import.h          |  6 +-
 tensorflow/lite/toco/tflite/import_test.cc    |  6 +-
 tensorflow/lite/toco/tflite/op_version.cc     |  8 +--
 .../lite/toco/tflite/op_version_test.cc       | 32 +++++-----
 tensorflow/lite/toco/tflite/operator.cc       | 37 ++++++------
 tensorflow/lite/toco/tflite/operator.h        | 12 ++--
 tensorflow/lite/toco/tflite/operator_test.cc  |  8 +--
 tensorflow/lite/toco/tflite/types.cc          |  6 +-
 12 files changed, 133 insertions(+), 126 deletions(-)

diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index d72a902001d..d109ab875b5 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -52,7 +52,7 @@ using ::tflite::Tensor;
 namespace {
 
 // Check if a TensorFlow Op is a control flow op by its name.
-bool IsControlFlowOp(const string& tensorflow_op) {
+bool IsControlFlowOp(const std::string& tensorflow_op) {
   // Technically this is equivalent to `::tensorflow::Node::IsControlFlow()`.
   // It requires to construct a `::tensorflow::Graph` to use that helper
   // function, so we simply hardcode the list of control flow ops here.
@@ -68,7 +68,7 @@ bool IsControlFlowOp(const string& tensorflow_op) {
 }
 
 // Check if a TensorFlow Op is unsupported by the Flex runtime.
-bool IsUnsupportedFlexOp(const string& tensorflow_op) {
+bool IsUnsupportedFlexOp(const std::string& tensorflow_op) {
   if (IsControlFlowOp(tensorflow_op)) {
     return true;
   }
@@ -82,14 +82,14 @@ bool IsUnsupportedFlexOp(const string& tensorflow_op) {
 }
 
 // Map from operator name to TF Lite enum value, for all builtins.
-const std::map& GetBuiltinOpsMap() {
-  static std::map* builtin_ops = nullptr;
+const std::map& GetBuiltinOpsMap() {
+  static std::map* builtin_ops = nullptr;
   if (builtin_ops == nullptr) {
-    builtin_ops = new std::map();
+    builtin_ops = new std::map();
 
     for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
       BuiltinOperator op = static_cast<BuiltinOperator>(i);
-      string name = EnumNameBuiltinOperator(op);
+      std::string name = EnumNameBuiltinOperator(op);
       if (op != BuiltinOperator_CUSTOM && !name.empty()) {
         (*builtin_ops)[name] = op;
       }
@@ -99,10 +99,10 @@ const std::map& GetBuiltinOpsMap() {
 }
 
 void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
-                        string* file_contents) {
+                        std::string* file_contents) {
   const uint8_t* buffer = builder.GetBufferPointer();
   int size = builder.GetSize();
-  *file_contents = string(reinterpret_cast(buffer), size);
+  *file_contents = std::string(reinterpret_cast(buffer), size);
 }
 
 }  // Anonymous namespace.
@@ -115,7 +115,7 @@ OperatorKey::OperatorKey(
     bool enable_select_tf_ops) {
   // Get the op name (by Toco definition).
   const ::toco::Operator& op = *op_signature.op;
-  string name = HelpfulOperatorTypeName(op);
+  std::string name = HelpfulOperatorTypeName(op);
 
   bool is_builtin = false;
   const auto& builtin_ops = GetBuiltinOpsMap();
@@ -146,7 +146,7 @@ OperatorKey::OperatorKey(
       is_flex_op_ = true;
       flex_tensorflow_op_ = tensorflow_op;
       custom_code_ =
-          string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
+          std::string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
     } else {
       custom_code_ = tensorflow_op;
     }
@@ -158,7 +158,7 @@ OperatorKey::OperatorKey(
     is_flex_op_ = true;
     flex_tensorflow_op_ = name;
     custom_code_ =
-        string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
+        std::string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
   } else {
     // If Flex is disabled or the original TensorFlow NodeDef isn't available,
     // we produce a custom op. This gives developers a chance to implement
@@ -175,7 +175,7 @@ OperatorKey::OperatorKey(
 
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
   // First find a list of unique array names.
-  std::set names;
+  std::set names;
   for (const auto& array_pair : model.GetArrayMap()) {
     names.insert(array_pair.first);
   }
@@ -218,7 +218,7 @@ Offset>> ExportTensors(
   std::map> ordered_tensors;
 
   for (const auto& array_pair : model.GetArrayMap()) {
-    const string& tensor_name = array_pair.first;
+    const std::string& tensor_name = array_pair.first;
     const toco::Array& array = *array_pair.second;
 
     int buffer_index = buffers_to_write->size();
@@ -283,7 +283,7 @@ Offset> ExportOutputTensors(
     const Model& model, const details::TensorsMap& tensors_map,
     FlatBufferBuilder* builder) {
   std::vector outputs;
-  for (const string& output : model.flags.output_arrays()) {
+  for (const std::string& output : model.flags.output_arrays()) {
     outputs.push_back(tensors_map.at(output));
   }
   return builder->CreateVector(outputs);
@@ -295,10 +295,10 @@ Offset>> ExportOperatorCodes(
     const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
     const ExportParams& params) {
   // Map from operator name to TF Lite enum value, for all builtins.
-  std::map builtin_ops;
+  std::map builtin_ops;
   for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
     BuiltinOperator op = static_cast<BuiltinOperator>(i);
-    string name = EnumNameBuiltinOperator(op);
+    std::string name = EnumNameBuiltinOperator(op);
     if (op != BuiltinOperator_CUSTOM && !name.empty()) {
       builtin_ops[name] = op;
     }
@@ -349,13 +349,13 @@ Offset>> ExportOperators(
   std::vector> op_vector;
   for (const auto& op : model.operators) {
     std::vector inputs;
-    for (const string& input : op->inputs) {
+    for (const std::string& input : op->inputs) {
       // -1 is the ID for optional tensor in TFLite output
       int id = model.IsOptionalArray(input) ? -1 : tensors_map.at(input);
       inputs.push_back(id);
     }
     std::vector outputs;
-    for (const string& output : op->outputs) {
+    for (const std::string& output : op->outputs) {
       outputs.push_back(tensors_map.at(output));
     }
     const toco::OperatorSignature op_signature = {op.get(), &model};
@@ -428,15 +428,15 @@ Offset>> ExportBuffers(
   return builder->CreateVector(buffer_vector);
 }
 
-tensorflow::Status Export(const Model& model, string* output_file_contents,
+tensorflow::Status Export(const Model& model, std::string* output_file_contents,
                           const ExportParams& params) {
   const auto ops_by_type = BuildOperatorByTypeMap(params.enable_select_tf_ops);
   return Export(model, output_file_contents, params, ops_by_type);
 }
 
-void ParseControlFlowErrors(std::set* custom_ops,
-                            std::vector* error_msgs) {
-  std::set unsupported_control_flow_ops;
+void ParseControlFlowErrors(std::set* custom_ops,
+                            std::vector* error_msgs) {
+  std::set unsupported_control_flow_ops;
   // Check if unsupported ops contains control flow ops. It's impossible
   // to implement these ops as custom ops at the moment.
   for (const auto& op : *custom_ops) {
@@ -471,10 +471,10 @@ void ExportModelVersionBuffer(
 }
 
 tensorflow::Status Export(
-    const Model& model, string* output_file_contents,
+    const Model& model, std::string* output_file_contents,
     const ExportParams& params,
     const std::map>& ops_by_type) {
-  for (const string& input_array : model.GetInvalidInputArrays()) {
+  for (const std::string& input_array : model.GetInvalidInputArrays()) {
     if (model.HasArray(input_array)) {
       return tensorflow::errors::InvalidArgument(
           absl::StrCat("Placeholder ", input_array,
@@ -509,11 +509,11 @@ tensorflow::Status Export(
   }
 
   // The set of used builtin ops.
-  std::set builtin_ops;
+  std::set builtin_ops;
   // The set of custom ops (not including Flex ops).
-  std::set custom_ops;
+  std::set custom_ops;
   // The set of Flex ops which are not supported.
-  std::set unsupported_flex_ops;
+  std::set unsupported_flex_ops;
 
   for (const auto& it : operators_map) {
     const details::OperatorKey& key = it.first;
@@ -540,7 +540,7 @@ tensorflow::Status Export(
                "40-tflite-op-request.md\n and pasting the following:\n\n";
       };
 
-      std::vector error_msgs;
+      std::vector error_msgs;
       ParseControlFlowErrors(&custom_ops, &error_msgs);
 
       // Remove ExpandDims and ReorderAxes from unimplemented list unless they
@@ -549,7 +549,7 @@ tensorflow::Status Export(
       // transformation is unable to run because the output shape is not
       // defined. This causes unnecessary confusion during model conversion
       // time.
-      std::set custom_ops_final;
+      std::set custom_ops_final;
       for (const auto& op_type : custom_ops) {
         if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
           custom_ops_final.insert(op_type);
diff --git a/tensorflow/lite/toco/tflite/export.h b/tensorflow/lite/toco/tflite/export.h
index 3af77ffcf43..64f7c7b128f 100644
--- a/tensorflow/lite/toco/tflite/export.h
+++ b/tensorflow/lite/toco/tflite/export.h
@@ -35,19 +35,19 @@ struct ExportParams {
 
 // Transform the given tf.mini model into a TF Lite flatbuffer and deposit the
 // result in the given string.
-tensorflow::Status Export(const Model& model, string* output_file_contents,
+tensorflow::Status Export(const Model& model, std::string* output_file_contents,
                           const ExportParams& params);
 
 // Export API with custom TFLite operator mapping.
 tensorflow::Status Export(
-    const Model& model, string* output_file_contents,
+    const Model& model, std::string* output_file_contents,
     const ExportParams& params,
     const std::map>& ops_by_type);
 
 // This is for backward-compatibility.
 // TODO(ycling): Remove the deprecated entry functions.
 inline void Export(const Model& model, bool allow_custom_ops,
-                   bool quantize_weights, string* output_file_contents) {
+                   bool quantize_weights, std::string* output_file_contents) {
   ExportParams params;
   params.allow_custom_ops = allow_custom_ops;
   params.quantize_weights =
@@ -60,7 +60,7 @@ inline void Export(const Model& model, bool allow_custom_ops,
 // TODO(ycling): Remove the deprecated entry functions.
 inline void Export(
     const Model& model, bool allow_custom_ops, bool quantize_weights,
-    string* output_file_contents,
+    std::string* output_file_contents,
     const std::map>& ops_by_type) {
   ExportParams params;
   params.allow_custom_ops = allow_custom_ops;
@@ -72,7 +72,7 @@ inline void Export(
 
 // This is for backward-compatibility.
 // TODO(ycling): Remove the deprecated entry functions.
-inline void Export(const Model& model, string* output_file_contents) {
+inline void Export(const Model& model, std::string* output_file_contents) {
   ExportParams params;
   params.allow_custom_ops = true;
   auto status = Export(model, output_file_contents, params);
@@ -82,7 +82,7 @@ inline void Export(const Model& model, string* output_file_contents) {
 namespace details {
 
 // A map from tensor name to its final position in the TF Lite buffer.
-using TensorsMap = std::unordered_map;
+using TensorsMap = std::unordered_map;
 
 // A key to identify an operator.
 // Only when `type` is `kUnsupported`, `custom_code` is filled to
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index 19b77543c66..ed347a28d51 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -34,13 +34,13 @@ using ::testing::HasSubstr;
 class ExportTest : public ::testing::Test {
  protected:
   void ResetOperators() { input_model_.operators.clear(); }
-  void AddTensorsByName(std::initializer_list names) {
-    for (const string& name : names) {
+  void AddTensorsByName(std::initializer_list names) {
+    for (const std::string& name : names) {
       input_model_.GetOrCreateArray(name);
     }
   }
-  void AddOperatorsByName(std::initializer_list names) {
-    for (const string& name : names) {
+  void AddOperatorsByName(std::initializer_list names) {
+    for (const std::string& name : names) {
       if (name == "Conv") {
         auto* op = new ConvOperator;
         op->padding.type = PaddingType::kSame;
@@ -153,14 +153,15 @@ class ExportTest : public ::testing::Test {
   }
 
   tensorflow::Status ExportAndReturnStatus(const ExportParams& params) {
-    string result;
+    std::string result;
     return Export(input_model_, &result, params);
   }
 
-  std::vector ExportAndSummarizeOperators(const ExportParams& params) {
-    std::vector names;
+  std::vector ExportAndSummarizeOperators(
+      const ExportParams& params) {
+    std::vector names;
 
-    string result;
+    std::string result;
     auto status = Export(input_model_, &result, params);
     if (!status.ok()) {
       LOG(INFO) << status.error_message();
@@ -171,10 +172,12 @@ class ExportTest : public ::testing::Test {
 
     for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
       if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
-        names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
-                                                 opcode->builtin_code()));
+        names.push_back(
+            std::string("builtin:") +
+            ::tflite::EnumNameBuiltinOperator(opcode->builtin_code()));
       } else {
-        names.push_back(string("custom:") + opcode->custom_code()->c_str());
+        names.push_back(std::string("custom:") +
+                        opcode->custom_code()->c_str());
       }
     }
 
@@ -185,7 +188,7 @@ class ExportTest : public ::testing::Test {
       const ExportParams& params) {
     std::vector indices;
 
-    string result;
+    std::string result;
     if (!Export(input_model_, &result, params).ok()) return indices;
     auto* model = ::tflite::GetModel(result.data());
 
@@ -257,7 +260,7 @@ TEST_F(ExportTest, ExportMinRuntime) {
   params.enable_select_tf_ops = false;
   params.quantize_weights = QuantizedBufferType::NONE;
 
-  string output;
+  std::string output;
   auto status = Export(input_model_, &output, params);
   auto* model = ::tflite::GetModel(output.data());
   EXPECT_EQ(model->metadata()->size(), 1);
@@ -265,7 +268,8 @@ TEST_F(ExportTest, ExportMinRuntime) {
   auto buf = model->metadata()->Get(0)->buffer();
   auto* buffer = (*model->buffers())[buf];
   auto* array = buffer->data();
-  string version(reinterpret_cast(array->data()), array->size());
+  std::string version(reinterpret_cast(array->data()),
+                      array->size());
   EXPECT_EQ(version, "1.6.0");
 }
 
@@ -275,7 +279,7 @@ TEST_F(ExportTest, ExportEmptyMinRuntime) {
   ExportParams params;
   params.allow_custom_ops = true;
 
-  string output;
+  std::string output;
   auto status = Export(input_model_, &output, params);
   auto* model = ::tflite::GetModel(output.data());
   EXPECT_EQ(model->metadata()->size(), 1);
@@ -283,7 +287,8 @@ TEST_F(ExportTest, ExportEmptyMinRuntime) {
   auto buf = model->metadata()->Get(0)->buffer();
   auto* buffer = (*model->buffers())[buf];
   auto* array = buffer->data();
-  string version(reinterpret_cast(array->data()), array->size());
+  std::string version(reinterpret_cast(array->data()),
+                      array->size());
   EXPECT_EQ(version, "");
 }
 
@@ -296,7 +301,7 @@ TEST_F(ExportTest, UnsupportedControlFlowErrors) {
   // The model contains control flow ops which are not convertible, so we should
   // check the returned error message.
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(status.error_message(),
@@ -318,7 +323,7 @@ TEST_F(ExportTest, UnsupportedOpsAndNeedEnableFlex) {
   params.allow_custom_ops = false;
   params.enable_select_tf_ops = false;
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(
@@ -348,7 +353,7 @@ TEST_F(ExportTest, UnsupportedOpsNeedCustomImplementation) {
   params.allow_custom_ops = false;
   params.enable_select_tf_ops = true;
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(
@@ -378,7 +383,7 @@ TEST_F(ExportTest, UnsupportedControlFlowAndCustomOpsErrors) {
   // The model contains control flow ops which are not convertible, so we should
   // check the returned error message.
 
-  string output;
+  std::string output;
   const auto ops_by_type = BuildOperatorByTypeMap();
   auto status = Export(input_model_, &output, params, ops_by_type);
   EXPECT_EQ(
@@ -407,11 +412,11 @@ TEST_F(ExportTest, UnsupportedControlFlowAndCustomOpsErrors) {
 TEST_F(ExportTest, QuantizeWeights) {
   // Sanity check for quantize_weights parameter.
   BuildQuantizableTestModel();
-  string unquantized_result;
+  std::string unquantized_result;
   Export(input_model_, true, /*quantize_weights*/ false, &unquantized_result);
 
   BuildQuantizableTestModel();
-  string quantized_result;
+  std::string quantized_result;
   Export(input_model_, true, /*quantize_weights*/ true, &quantized_result);
 
   // The quantized models should be smaller.
@@ -443,12 +448,13 @@ class OpSetsTest : public ExportTest {
     }
   }
 
-  std::vector ImportExport(std::initializer_list op_names) {
+  std::vector ImportExport(
+      std::initializer_list op_names) {
     ResetOperators();
     if (!import_all_ops_as_unsupported_) {
       AddOperatorsByName(op_names);
     } else {
-      for (const string& name : op_names) {
+      for (const std::string& name : op_names) {
         auto* op = new TensorFlowUnsupportedOperator;
         op->tensorflow_op = name;
         input_model_.operators.emplace_back(op);
@@ -644,7 +650,7 @@ TEST_F(VersionedOpExportTest, Export) {
   AddConvOp(false);
   AddConvOp(true);
 
-  string result;
+  std::string result;
   const auto ops_by_type = BuildFakeOperatorByTypeMap();
   Export(input_model_, true, false, &result, ops_by_type);
 
diff --git a/tensorflow/lite/toco/tflite/import.cc b/tensorflow/lite/toco/tflite/import.cc
index 0f3dd48652e..136aa4ffaa8 100644
--- a/tensorflow/lite/toco/tflite/import.cc
+++ b/tensorflow/lite/toco/tflite/import.cc
@@ -99,7 +99,7 @@ void ImportTensors(const ::tflite::Model& input_model, Model* model) {
 
 void ImportOperators(
     const ::tflite::Model& input_model,
-    const std::map>& ops_by_name,
+    const std::map>& ops_by_name,
     const details::TensorsTable& tensors_table,
     const details::OperatorsTable& operators_table, Model* model) {
   // TODO(aselle): add support for multiple subgraphs.
@@ -112,12 +112,12 @@ void ImportOperators(
       LOG(FATAL) << "Index " << index << " must be between zero and "
                  << operators_table.size();
     }
-    string opname = operators_table.at(index);
+    std::string opname = operators_table.at(index);
 
     // Find and use the appropriate operator deserialization factory.
     std::unique_ptr new_op = nullptr;
     if (ops_by_name.count(opname) == 0) {
-      string effective_opname = "TENSORFLOW_UNSUPPORTED";
+      std::string effective_opname = "TENSORFLOW_UNSUPPORTED";
       if (ops_by_name.count(effective_opname) == 0) {
         LOG(FATAL) << "Internal logic error: TENSORFLOW_UNSUPPORTED not found.";
       }
@@ -147,10 +147,10 @@ void ImportOperators(
       auto input_index = inputs->Get(i);
       // input_index == -1 indicates optional tensor.
       if (input_index != -1) {
-        const string& input_name = tensors_table.at(input_index);
+        const std::string& input_name = tensors_table.at(input_index);
         op->inputs.push_back(input_name);
       } else {
-        const string& tensor_name =
+        const std::string& tensor_name =
             toco::AvailableArrayName(*model, "OptionalTensor");
         model->CreateOptionalArray(tensor_name);
         op->inputs.push_back(tensor_name);
@@ -159,7 +159,7 @@ void ImportOperators(
     auto outputs = input_op->outputs();
     for (int i = 0; i < outputs->Length(); i++) {
       auto output_index = outputs->Get(i);
-      const string& output_name = tensors_table.at(output_index);
+      const std::string& output_name = tensors_table.at(output_index);
       op->outputs.push_back(output_name);
     }
   }
@@ -173,7 +173,7 @@ void ImportIOTensors(const ModelFlags& model_flags,
     auto inputs = (*input_model.subgraphs())[0]->inputs();
     if (inputs) {
       for (int input : *inputs) {
-        const string& input_name = tensors_table.at(input);
+        const std::string& input_name = tensors_table.at(input);
         model->flags.add_input_arrays()->set_name(input_name);
       }
     }
@@ -184,7 +184,7 @@ void ImportIOTensors(const ModelFlags& model_flags,
     auto outputs = (*input_model.subgraphs())[0]->outputs();
     if (outputs) {
       for (int output : *outputs) {
-        const string& output_name = tensors_table.at(output);
+        const std::string& output_name = tensors_table.at(output);
         model->flags.add_output_arrays(output_name);
       }
     }
@@ -199,7 +199,7 @@ bool Verify(const void* buf, size_t len) {
 }  // namespace
 
 std::unique_ptr Import(const ModelFlags& model_flags,
-                              const string& input_file_contents) {
+                              const std::string& input_file_contents) {
   ::tflite::AlwaysTrueResolver r;
   if (!::tflite::Verify(input_file_contents.data(), input_file_contents.size(),
                         r, ::tflite::DefaultErrorReporter())) {
diff --git a/tensorflow/lite/toco/tflite/import.h b/tensorflow/lite/toco/tflite/import.h
index f5de3b53b5b..bac55aae8b9 100644
--- a/tensorflow/lite/toco/tflite/import.h
+++ b/tensorflow/lite/toco/tflite/import.h
@@ -24,17 +24,17 @@ namespace tflite {
 
 // Parse the given string as TF Lite flatbuffer and return a new tf.mini model.
 std::unique_ptr Import(const ModelFlags &model_flags,
-                              const string &input_file_contents);
+                              const std::string &input_file_contents);
 
 namespace details {
 
 // The names of all tensors found in a TF Lite model.
-using TensorsTable = std::vector;
+using TensorsTable = std::vector;
 
 // The names of all operators found in TF Lite model. If the operator is
 // builtin, the string representation of the corresponding enum value is used
 // as name.
-using OperatorsTable = std::vector;
+using OperatorsTable = std::vector;
 
 void LoadTensorsTable(const ::tflite::Model &input_model,
                       TensorsTable *tensors_table);
diff --git a/tensorflow/lite/toco/tflite/import_test.cc b/tensorflow/lite/toco/tflite/import_test.cc
index b00c4124d83..6163ebab45b 100644
--- a/tensorflow/lite/toco/tflite/import_test.cc
+++ b/tensorflow/lite/toco/tflite/import_test.cc
@@ -134,9 +134,9 @@ class ImportTest : public ::testing::Test {
 
     input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
   }
-  string InputModelAsString() {
-    return string(reinterpret_cast(builder_.GetBufferPointer()),
-                  builder_.GetSize());
+  std::string InputModelAsString() {
+    return std::string(reinterpret_cast(builder_.GetBufferPointer()),
+                       builder_.GetSize());
   }
   flatbuffers::FlatBufferBuilder builder_;
   const ::tflite::Model* input_model_ = nullptr;
diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc
index cf127a9f459..efa53c69cae 100644
--- a/tensorflow/lite/toco/tflite/op_version.cc
+++ b/tensorflow/lite/toco/tflite/op_version.cc
@@ -29,7 +29,7 @@ namespace tflite {
 
 // Deprecated and please register new ops/versions in
 // tflite/tools/versioning/op_version.cc".
-string GetMinimumRuntimeVersionForModel(const Model& model) {
+std::string GetMinimumRuntimeVersionForModel(const Model& model) {
   // Use this as the placeholder string if a particular op is not yet included
   // in any Tensorflow's RC/Final release source package. Once that op is
   // included in the release, please update this with the real version string.
@@ -37,8 +37,8 @@ string GetMinimumRuntimeVersionForModel(const Model& model) {
   // A map from the version key of an op to its minimum runtime version.
   // For example, {{kAveragePool, 1}, "1.5.0"},  means the 1st version of
   // AveragePool requires a minimum TF Lite runtime version '1.5.0`.
-  static const std::map, string>* op_version_map =
-      new std::map, string>({
+  static const std::map, std::string>*
+      op_version_map = new std::map, std::string>({
           {{OperatorType::kAveragePool, 1}, "1.5.0"},
           {{OperatorType::kAveragePool, 2}, "1.14.0"},
           {{OperatorType::kAveragePool, 3}, kPendingReleaseOpVersion},
@@ -253,7 +253,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) {
       tflite::BuildOperatorByTypeMap(false /*enable_select_tf_ops=*/);
   OperatorSignature op_signature;
   op_signature.model = &model;
-  string model_min_version;
+  std::string model_min_version;
   for (const auto& op : model.operators) {
     if (op_types_map.find(op->type) == op_types_map.end()) continue;
     op_signature.op = op.get();
diff --git a/tensorflow/lite/toco/tflite/op_version_test.cc b/tensorflow/lite/toco/tflite/op_version_test.cc
index 14b086471b7..8466fc35ad7 100644
--- a/tensorflow/lite/toco/tflite/op_version_test.cc
+++ b/tensorflow/lite/toco/tflite/op_version_test.cc
@@ -27,9 +27,9 @@ TEST(OpVersionTest, MinimumVersionForSameOpVersions) {
   Model model;
   // Float convolutional kernel is introduced since '1.5.0'.
   std::unique_ptr conv(new ConvOperator());
-  const string conv_input = "conv_input";
-  const string conv_filter = "conv_filter";
-  const string conv_output = "conv_output";
+  const std::string conv_input = "conv_input";
+  const std::string conv_filter = "conv_filter";
+  const std::string conv_output = "conv_output";
   conv->inputs.push_back(conv_input);
   conv->inputs.push_back(conv_filter);
   conv->outputs.push_back(conv_output);
@@ -44,8 +44,8 @@ TEST(OpVersionTest, MinimumVersionForSameOpVersions) {
 
   // Float softmax kernel is introduced since '1.5.0'.
   std::unique_ptr softmax(new SoftmaxOperator());
-  const string softmax_input = "softmax_input";
-  const string softmax_output = "softmax_output";
+  const std::string softmax_input = "softmax_input";
+  const std::string softmax_output = "softmax_output";
   softmax->inputs.push_back(softmax_input);
   softmax->outputs.push_back(softmax_output);
   array_map[softmax_input] = std::unique_ptr(new Array);
@@ -60,9 +60,9 @@ TEST(OpVersionTest, MinimumVersionForMultipleOpVersions) {
   Model model;
   // Dilated DepthWiseConvolution is introduced since '1.12.0'.
   std::unique_ptr conv(new DepthwiseConvOperator());
-  const string conv_input = "conv_input";
-  const string conv_filter = "conv_filter";
-  const string conv_output = "conv_output";
+  const std::string conv_input = "conv_input";
+  const std::string conv_filter = "conv_filter";
+  const std::string conv_output = "conv_output";
   conv->inputs.push_back(conv_input);
   conv->inputs.push_back(conv_filter);
   conv->outputs.push_back(conv_output);
@@ -77,10 +77,10 @@ TEST(OpVersionTest, MinimumVersionForMultipleOpVersions) {
   // FullyConnected op with kShuffled4x16Int8 weight format is introduced from
   // '1.10.0'.
   std::unique_ptr fc(new FullyConnectedOperator());
-  const string fc_input = "fc_input";
-  const string fc_weights = "fc_weights";
-  const string fc_bias = "fc_bias";
-  const string fc_output = "fc_output";
+  const std::string fc_input = "fc_input";
+  const std::string fc_weights = "fc_weights";
+  const std::string fc_bias = "fc_bias";
+  const std::string fc_output = "fc_output";
   fc->inputs.push_back(fc_input);
   fc->inputs.push_back(fc_weights);
   fc->inputs.push_back(fc_bias);
@@ -121,10 +121,10 @@ TEST(OpVersionTest, MinimumVersionForMixedOpVersions) {
   // FullyConnected op with kShuffled4x16Int8 weight format is introduced from
   // '1.10.0'.
   std::unique_ptr fc(new FullyConnectedOperator());
-  const string fc_input = "fc_input";
-  const string fc_weights = "fc_weights";
-  const string fc_bias = "fc_bias";
-  const string fc_output = "fc_output";
+  const std::string fc_input = "fc_input";
+  const std::string fc_weights = "fc_weights";
+  const std::string fc_bias = "fc_bias";
+  const std::string fc_output = "fc_output";
   fc->inputs.push_back(fc_input);
   fc->inputs.push_back(fc_weights);
   fc->inputs.push_back(fc_bias);
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index fee10a19787..be539cf6054 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -238,7 +238,7 @@ class SpaceToBatchND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input_name = op_signature.op->inputs[0];
+    const std::string& input_name = op_signature.op->inputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     ::tflite::OpSignature op_sig =
         GetVersioningOpSig(builtin_op(), op_signature);
@@ -268,8 +268,8 @@ class Sub : public BuiltinOperatorinputs[0];
-    const string& input2_name = op_signature.op->inputs[1];
+    const std::string& input1_name = op_signature.op->inputs[0];
+    const std::string& input2_name = op_signature.op->inputs[1];
     const Array& input1_array = op_signature.model->GetArray(input1_name);
     const Array& input2_array = op_signature.model->GetArray(input2_name);
     ::tflite::OpSignature op_sig =
@@ -305,8 +305,8 @@ class Div : public BuiltinOperatorinputs[0];
-    const string& input2_name = op_signature.op->inputs[1];
+    const std::string& input1_name = op_signature.op->inputs[0];
+    const std::string& input2_name = op_signature.op->inputs[1];
     const Array& input1_array = op_signature.model->GetArray(input1_name);
     const Array& input2_array = op_signature.model->GetArray(input2_name);
     ::tflite::OpSignature op_sig =
@@ -339,7 +339,7 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 
   int GetVersion(const OperatorSignature& op_signature) const override {
-    const string& input_name = op_signature.op->inputs[0];
+    const std::string& input_name = op_signature.op->inputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     ::tflite::OpSignature op_sig =
         GetVersioningOpSig(builtin_op(), op_signature);
@@ -662,9 +662,9 @@ class Mul : public BuiltinOperatorinputs[0];
-    const string& input2_name = op_signature.op->inputs[1];
-    const string& output_name = op_signature.op->outputs[0];
+    const std::string& input1_name = op_signature.op->inputs[0];
+    const std::string& input2_name = op_signature.op->inputs[1];
+    const std::string& output_name = op_signature.op->outputs[0];
     const Array& input1_array = op_signature.model->GetArray(input1_name);
     const Array& input2_array = op_signature.model->GetArray(input2_name);
     const Array& output_array = op_signature.model->GetArray(output_name);
@@ -1440,7 +1440,7 @@ class Unpack : public BuiltinOperatorinputs[0];
+    const std::string& input_name = op_signature.op->inputs[0];
     const Array& input_array = op_signature.model->GetArray(input_name);
     // If the op take int8/uint8 input, it is version 2.
     if (input_array.data_type == ArrayDataType::kInt8 ||
@@ -1577,7 +1577,7 @@ class Where : public BuiltinOperator WriteFlexOpOptions(
-    const string& tensorflow_node_def) {
+    const std::string& tensorflow_node_def) {
   auto fbb = absl::make_unique();
 
   ::tensorflow::NodeDef node_def;
@@ -1597,7 +1597,7 @@ std::unique_ptr WriteFlexOpOptions(
 
 class TensorFlowUnsupported : public BaseOperator {
  public:
-  TensorFlowUnsupported(const string& name, OperatorType type,
+  TensorFlowUnsupported(const std::string& name, OperatorType type,
                         bool enable_select_tf_ops)
       : BaseOperator(name, type), enable_select_tf_ops_(enable_select_tf_ops) {}
 
@@ -1676,7 +1676,7 @@ class TensorFlowUnsupported : public BaseOperator {
         case tensorflow::AttrValue::kList:
           if (attr.list().s_size() > 0) {
             auto start = fbb->StartVector(key);
-            for (const string& v : attr.list().s()) {
+            for (const std::string& v : attr.list().s()) {
               fbb->Add(v);
             }
             fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
@@ -1736,10 +1736,11 @@ class TensorFlowUnsupported : public BaseOperator {
           break;
         case flexbuffers::FBT_BOOL:
           (*attr)[key].set_b(value.AsBool());
-          if (string(key) == "_output_quantized") {
+          if (std::string(key) == "_output_quantized") {
             op->quantized = value.AsBool();
           }
-          if (string(key) == "_support_output_type_float_in_quantized_op") {
+          if (std::string(key) ==
+              "_support_output_type_float_in_quantized_op") {
             op->support_output_type_float_in_quantized_op = value.AsBool();
           }
           break;
@@ -2095,9 +2096,9 @@ std::map> BuildOperatorByTypeMap(
   return result;
 }
 
-std::map> BuildOperatorByNameMap(
+std::map> BuildOperatorByNameMap(
     bool enable_select_tf_ops) {
-  std::map> result;
+  std::map> result;
 
   std::vector> ops =
       BuildOperatorList(enable_select_tf_ops);
@@ -2109,7 +2110,7 @@ std::map> BuildOperatorByNameMap(
 }
 
 bool ShouldExportAsFlexOp(bool enable_select_tf_ops,
-                          const string& tensorflow_op_name) {
+                          const std::string& tensorflow_op_name) {
   // If Flex ops aren't allow at all, simply return false.
   if (!enable_select_tf_ops) {
     return false;
diff --git a/tensorflow/lite/toco/tflite/operator.h b/tensorflow/lite/toco/tflite/operator.h
index 19d92145e0c..fb79b97f46e 100644
--- a/tensorflow/lite/toco/tflite/operator.h
+++ b/tensorflow/lite/toco/tflite/operator.h
@@ -30,7 +30,7 @@ class BaseOperator;
 // Return a map contained all know TF Lite Operators, keyed by their names.
 // TODO(ycling): The pattern to propagate parameters (e.g. enable_select_tf_ops)
 // is ugly here. Consider refactoring.
-std::map> BuildOperatorByNameMap(
+std::map> BuildOperatorByNameMap(
     bool enable_select_tf_ops = false);
 
 // Return a map contained all know TF Lite Operators, keyed by the type of
@@ -41,7 +41,7 @@ std::map> BuildOperatorByTypeMap(
 // Write the custom option FlexBuffer with a serialized TensorFlow NodeDef
 // for a Flex op.
 std::unique_ptr WriteFlexOpOptions(
-    const string& tensorflow_node_def);
+    const std::string& tensorflow_node_def);
 
 // These are the flatbuffer types for custom and builtin options.
 using CustomOptions = flatbuffers::Vector;
@@ -71,11 +71,11 @@ struct Options {
 class BaseOperator {
  public:
   // Build an operator with the given TF Lite name and tf.mini type.
-  BaseOperator(const string& name, OperatorType type)
+  BaseOperator(const std::string& name, OperatorType type)
       : name_(name), type_(type) {}
   virtual ~BaseOperator() = default;
 
-  string name() const { return name_; }
+  std::string name() const { return name_; }
   OperatorType type() const { return type_; }
 
   // Given a tf.mini operator, create the corresponding flatbuffer options and
@@ -111,7 +111,7 @@ class BaseOperator {
   }
 
  private:
-  string name_;
+  std::string name_;
   OperatorType type_;
 };
 
@@ -123,7 +123,7 @@ class BaseOperator {
 // Helper function to determine if a unsupported TensorFlow op should be
 // exported as an Flex op or a regular custom op.
 bool ShouldExportAsFlexOp(bool enable_select_tf_ops,
-                          const string& tensorflow_op_name);
+                          const std::string& tensorflow_op_name);
 
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index a4fe01e4afd..cb466fef079 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -30,8 +30,8 @@ namespace {
 class OperatorTest : public ::testing::Test {
  protected:
   // Return the operator for the given name and type.
-  const BaseOperator& GetOperator(const string& name, OperatorType type) {
-    using OpsByName = std::map>;
+  const BaseOperator& GetOperator(const std::string& name, OperatorType type) {
+    using OpsByName = std::map>;
     using OpsByType = std::map>;
 
     static auto* by_name = new OpsByName(BuildOperatorByNameMap());
@@ -86,7 +86,7 @@ class OperatorTest : public ::testing::Test {
   // Verify serialization and deserialization of simple operators (those
   // that don't have any configuration parameters).
   template 
-  void CheckSimpleOperator(const string& name, OperatorType type) {
+  void CheckSimpleOperator(const std::string& name, OperatorType type) {
     Options options;
     auto output_toco_op =
         SerializeAndDeserialize(GetOperator(name, type), T(), &options);
@@ -99,7 +99,7 @@ class OperatorTest : public ::testing::Test {
   }
 
   template 
-  void CheckReducerOperator(const string& name, OperatorType type) {
+  void CheckReducerOperator(const std::string& name, OperatorType type) {
     T op;
 
     op.keep_dims = false;
diff --git a/tensorflow/lite/toco/tflite/types.cc b/tensorflow/lite/toco/tflite/types.cc
index 96cad557baf..9d4ab8434d1 100644
--- a/tensorflow/lite/toco/tflite/types.cc
+++ b/tensorflow/lite/toco/tflite/types.cc
@@ -25,7 +25,7 @@ DataBuffer::FlatBufferOffset CopyStringToBuffer(
     const Array& array, flatbuffers::FlatBufferBuilder* builder) {
   const auto& src_data = array.GetBuffer().data;
   ::tflite::DynamicBuffer dyn_buffer;
-  for (const string& str : src_data) {
+  for (const std::string& str : src_data) {
     dyn_buffer.AddString(str.c_str(), str.length());
   }
   char* tensor_buffer;
@@ -58,12 +58,12 @@ DataBuffer::FlatBufferOffset CopyBuffer(
 
 void CopyStringFromBuffer(const ::tflite::Buffer& buffer, Array* array) {
   auto* src_data = reinterpret_cast(buffer.data()->data());
-  std::vector* dst_data =
+  std::vector* dst_data =
       &array->GetMutableBuffer().data;
   int32_t num_strings = ::tflite::GetStringCount(src_data);
   for (int i = 0; i < num_strings; i++) {
     ::tflite::StringRef str_ref = ::tflite::GetString(src_data, i);
-    string this_str(str_ref.str, str_ref.len);
+    std::string this_str(str_ref.str, str_ref.len);
     dst_data->push_back(this_str);
   }
 }

From 642ad434d8315561ef9cc02cc9157436fe9c0f72 Mon Sep 17 00:00:00 2001
From: Jingyue Wu 
Date: Fri, 19 Jun 2020 09:50:56 -0700
Subject: [PATCH 111/112] Fix exports_files.

cl/317237033 replaced cwise_op_neg.cc with cwise_op_neg_1.cc and
cwise_op_neg_2.cc.

PiperOrigin-RevId: 317325461
Change-Id: Ib44ff36474b7e55d9e84ff737ea82b9dac46b9f9
---
 tensorflow/core/kernels/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0b7a092033b..e2ff5aed283 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -8802,7 +8802,8 @@ exports_files([
     "cwise_op_mod.cc",
     "cwise_op_mul_1.cc",
     "cwise_op_mul_2.cc",
-    "cwise_op_neg.cc",
+    "cwise_op_neg_1.cc",
+    "cwise_op_neg_2.cc",
     "cwise_op_not_equal_to_1.cc",
     "cwise_op_not_equal_to_2.cc",
     "cwise_op_round.cc",

From c3cc3c40b08a37535a281e1d7a5fd7d3d802aac6 Mon Sep 17 00:00:00 2001
From: Mihai Maruseac 
Date: Fri, 19 Jun 2020 09:55:49 -0700
Subject: [PATCH 112/112] Move fuzzers for TF ops to own subdir. Trim some
 dependencies.

This duplicates some of the BUILD dependency tree to work around the need to link huge bottleneck dependencies (such as `//tensorflow/core:framework`). Until TF can use `cc_shared_library` in a stable way (and all the required Bazel support exists), we will need to use the duplicated tree for fuzzing.

PiperOrigin-RevId: 317326319
Change-Id: I1493e3ae7340298971fe15bd3702b63657f9bf9f
---
 tensorflow/core/framework/BUILD               |   1 +
 tensorflow/security/fuzzing/BUILD             |  14 --
 tensorflow/security/fuzzing/op_fuzzing/BUILD  |  39 +++++
 .../fuzzing/op_fuzzing/fuzz_session.h         | 156 ++++++++++++++++++
 .../fuzzing/{ => op_fuzzing}/identity_fuzz.cc |   2 +-
 5 files changed, 197 insertions(+), 15 deletions(-)
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/BUILD
 create mode 100644 tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
 rename tensorflow/security/fuzzing/{ => op_fuzzing}/identity_fuzz.cc (95%)

diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD
index 52f15dcb5c2..d47c74a629d 100644
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@@ -719,6 +719,7 @@ tf_cuda_library(
     visibility = [
         "//tensorflow/core:__pkg__",
         "//tensorflow/core/util:__pkg__",
+        "//tensorflow/security/fuzzing:__subpackages__",
     ],
     deps = [
         ":allocation_description_proto_cc",
diff --git a/tensorflow/security/fuzzing/BUILD b/tensorflow/security/fuzzing/BUILD
index 9b5aeec2d36..6b6c8275275 100644
--- a/tensorflow/security/fuzzing/BUILD
+++ b/tensorflow/security/fuzzing/BUILD
@@ -18,17 +18,3 @@ tf_fuzz_target(
         "//tensorflow/core/platform:status",
     ],
 )
-
-# A trivial fuzzer with no pre-specified corpus.
-# TODO(mihaimaruseac): Move fuzz_session and the op fuzzers to a subdirectory
-tf_fuzz_target(
-    name = "identity_fuzz",
-    srcs = ["identity_fuzz.cc"],
-    deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/core/kernels/fuzzing:fuzz_session",
-        # Needed only to transitiviely link dependencies
-        "//tensorflow/cc:scope",
-        "//tensorflow/core:core_cpu",
-    ],
-)
diff --git a/tensorflow/security/fuzzing/op_fuzzing/BUILD b/tensorflow/security/fuzzing/op_fuzzing/BUILD
new file mode 100644
index 00000000000..aacd2f16cc4
--- /dev/null
+++ b/tensorflow/security/fuzzing/op_fuzzing/BUILD
@@ -0,0 +1,39 @@
+# Fuzzing TensorFlow ops.
+# Most ops have a similar set of dependencies and a similar fuzzing
+# infrastructure. Hence, we gather everything in one single place.
+# Note that these fuzzers cover a large part of TF; they are not granular.
+
+load(
+    "//tensorflow/security/fuzzing:tf_fuzzing.bzl",
+    "tf_fuzz_target",
+)
+
+package(
+    licenses = ["notice"],  # Apache 2.0
+)
+
+# Since all ops need to have a graph created before being fuzzed, we define
+# this header-only library to handle the needed plumbing.
+cc_library(
+    name = "fuzz_session",
+    hdrs = ["fuzz_session.h"],
+    deps = [
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core/common_runtime:direct_session_internal",
+        "//tensorflow/core/framework:tensor",
+        "//tensorflow/core/platform:status",
+    ],
+)
+
+# A trivial fuzzer with no pre-specified corpus.
+tf_fuzz_target(
+    name = "identity_fuzz",
+    srcs = ["identity_fuzz.cc"],
+    deps = [
+        ":fuzz_session",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core/kernels:array",
+    ],
+)
diff --git a/tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h b/tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
new file mode 100644
index 00000000000..575212b3b86
--- /dev/null
+++ b/tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h
@@ -0,0 +1,156 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_SECURITY_FUZZING_OP_FUZZING_FUZZ_SESSION_H_
+#define TENSORFLOW_SECURITY_FUZZING_OP_FUZZING_FUZZ_SESSION_H_
+
+#include 
+#include 
+#include 
+#include 
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/public/session_options.h"
+
+// Standard invoking function macro to dispatch to a fuzzer class.
+#define STANDARD_TF_FUZZ_FUNCTION(FuzzerClass)                              \
+  extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { \
+    static FuzzerClass* fuzzer = new FuzzerClass();                         \
+    return fuzzer->Fuzz(data, size);                                        \
+  }
+
+// Standard builder for hooking one placeholder to one op.
+#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                          \
+  void BuildGraph(const Scope& scope) override {                        \
+    auto op_node =                                                      \
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), dtype); \
+    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node); \
+  }
+
+namespace tensorflow {
+namespace fuzzing {
+
+// Create a TensorFlow session using a specific GraphDef created
+// by BuildGraph(), and make it available for fuzzing.
+// Users must override BuildGraph and FuzzImpl to specify
+// (1) which operations are being fuzzed; and
+// (2) How to translate the uint8_t* buffer from the fuzzer
+//     to a Tensor or Tensors that are semantically appropriate
+//     for the op under test.
+// For the simple cases of testing a single op that takes a single
+// input Tensor, use the SINGLE_INPUT_OP_BUILDER(dtype, opName) macro in place
+// of defining BuildGraph.
+//
+// Typical use:
+// class FooFuzzer : public FuzzSession {
+//   SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);
+//   void FuzzImpl(const uint8_t* data, size_t size) {
+//      ... convert data and size to a Tensor, pass it to:
+//      RunInputs({{"input", input_tensor}});
+//
+class FuzzSession {
+ public:
+  FuzzSession() : initialized_(false) {}
+  virtual ~FuzzSession() {}
+
+  // Constructs a Graph using the supplied Scope.
+  // By convention, the graph should have inputs named "input1", ...
+  // "inputN", and one output node, named "output".
+  // Users of FuzzSession should override this method to create their graph.
+  virtual void BuildGraph(const Scope& scope) = 0;
+
+  // Implements the logic that converts an opaque byte buffer
+  // from the fuzzer to Tensor inputs to the graph.  Users must override.
+  virtual void FuzzImpl(const uint8_t* data, size_t size) = 0;
+
+  // Initializes the FuzzSession.  Not safe for multithreading.
+  // Separate init function because the call to virtual BuildGraph
+  // can't be put into the constructor.
+  Status InitIfNeeded() {
+    if (initialized_) {
+      return Status::OK();
+    }
+    initialized_ = true;
+
+    Scope root = Scope::DisabledShapeInferenceScope().ExitOnError();
+    SessionOptions options;
+    session_ = std::unique_ptr(NewSession(options));
+
+    BuildGraph(root);
+
+    GraphDef graph_def;
+    TF_CHECK_OK(root.ToGraphDef(&graph_def));
+
+    Status status = session_->Create(graph_def);
+    if (!status.ok()) {
+      // This is FATAL, because this code is designed to fuzz an op
+      // within a session.  Failure to create the session means we
+      // can't send any data to the op.
+      LOG(FATAL) << "Could not create session: " << status.error_message();
+    }
+    return status;
+  }
+
+  // Runs the TF session by pulling on the "output" node, attaching
+  // the supplied input_tensor to the input node(s), and discarding
+  // any returned output.
+  // Note: We are ignoring Status from Run here since fuzzers don't need to
+  // check it (as that will slow them down and printing/logging is useless).
+  void RunInputs(const std::vector >& inputs) {
+    RunInputsWithStatus(inputs).IgnoreError();
+  }
+
+  // Same as RunInputs but don't ignore status
+  Status RunInputsWithStatus(
+      const std::vector >& inputs) {
+    return session_->Run(inputs, {}, {"output"}, nullptr);
+  }
+
+  // Dispatches to FuzzImpl;  small amount of sugar to keep the code
+  // of the per-op fuzzers tiny.
+  int Fuzz(const uint8_t* data, size_t size) {
+    Status status = InitIfNeeded();
+    TF_CHECK_OK(status) << "Fuzzer graph initialization failed: "
+                        << status.error_message();
+    // No return value from fuzzing:  Success is defined as "did not
+    // crash".  The actual application results are irrelevant.
+    FuzzImpl(data, size);
+    return 0;
+  }
+
+ private:
+  bool initialized_;
+  std::unique_ptr session_;
+};
+
+// A specialized fuzz implementation for ops that take
+// a single string.  Caller must still define the op
+// to plumb by overriding BuildGraph or using
+// a plumbing macro.
+class FuzzStringInputOp : public FuzzSession {
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    input_tensor.scalar()() =
+        string(reinterpret_cast(data), size);
+    RunInputs({{"input", input_tensor}});
+  }
+};
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_SECURITY_FUZZING_OP_FUZZING_FUZZ_SESSION_H_
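
With this header in place, a new op fuzzer only needs to pick the op, map the raw fuzzer bytes onto input tensors, and emit the libFuzzer entry point. The sketch below illustrates that pattern for the Identity op; it mirrors the "Typical use" comment above and the renamed identity_fuzz.cc, but it is not a verbatim copy of that file, and the tensor-filling details are illustrative.

    #include "tensorflow/cc/ops/standard_ops.h"
    #include "tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h"

    namespace tensorflow {
    namespace fuzzing {

    class FuzzIdentity : public FuzzSession {
     public:
      // Wires a single int8 placeholder named "input" into the Identity op,
      // whose result is named "output" (see SINGLE_INPUT_OP_BUILDER above).
      SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);

      void FuzzImpl(const uint8_t* data, size_t size) override {
        // Copy the fuzzer-provided bytes into a 1-D int8 tensor and run it
        // through the graph, ignoring the op's output.
        Tensor input_tensor(DT_INT8, TensorShape({static_cast<int64>(size)}));
        auto flat = input_tensor.flat<int8>();
        for (size_t i = 0; i < size; ++i) {
          flat(i) = static_cast<int8>(data[i]);
        }
        RunInputs({{"input", input_tensor}});
      }
    };

    STANDARD_TF_FUZZ_FUNCTION(FuzzIdentity);

    }  // namespace fuzzing
    }  // namespace tensorflow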
diff --git a/tensorflow/security/fuzzing/identity_fuzz.cc b/tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
similarity index 95%
rename from tensorflow/security/fuzzing/identity_fuzz.cc
rename to tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
index 4c1049d381b..a63c35b45e2 100644
--- a/tensorflow/security/fuzzing/identity_fuzz.cc
+++ b/tensorflow/security/fuzzing/op_fuzzing/identity_fuzz.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+#include "tensorflow/security/fuzzing/op_fuzzing/fuzz_session.h"
 
 namespace tensorflow {
 namespace fuzzing {